🚧 Use a temporary sqlite db

fix/use-temp-sqlite3
Mike Gerber 4 weeks ago
parent 8d6b97f6b3
commit a1390699d4

@ -3,7 +3,9 @@ import csv
import logging import logging
import os import os
import re import re
import sqlite3
import warnings import warnings
import sys
from lxml import etree as ET from lxml import etree as ET
from itertools import groupby from itertools import groupby
from operator import attrgetter from operator import attrgetter
@ -394,7 +396,45 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
else: else:
mets_files_real.append(m) mets_files_real.append(m)
current_columns = []
def valid_column_key(k):
    """Return True if *k* may be used as a (quoted) SQL column name.

    A valid name is non-empty and consists only of ASCII letters, digits,
    spaces, underscores and hyphens.
    """
    # Use fullmatch() rather than match() with "^...$": "$" also matches
    # right before a trailing newline, so "name\n" would be accepted.
    return re.fullmatch(r"[a-zA-Z0-9 _-]+", k) is not None
def insert_into_db(con, d: Dict):
    """Insert the dict *d* as one row into the ``mods_info`` table of *con*.

    The table is created on the first call, using the keys of *d* as column
    names. On later calls, keys that have not been seen before are added as
    new columns. The known column names are tracked in the ``current_columns``
    list of the enclosing scope. Every value is stored as its ``str()``
    representation.

    Args:
        con: Connection object whose ``execute()`` method runs the SQL.
        d: Mapping of column name to value for the row to insert.

    Raises:
        ValueError: If a new key of *d* is not a valid column name. Neither
            the table nor ``current_columns`` is modified in that case.
    """
    new_columns = [k for k in d if k not in current_columns]

    # Column names are interpolated into the SQL text (identifiers cannot be
    # bound as parameters), so they must be validated. Raise explicitly
    # instead of using assert, which is stripped under "python -O". Validate
    # everything up front so a bad key cannot leave current_columns half
    # populated and out of sync with the actual table.
    for k in new_columns:
        if not valid_column_key(k):
            raise ValueError(f'"{k}" is not a valid column name')

    if not current_columns:
        # Create table if necessary
        column_defs = ",".join(f'"{c}"' for c in new_columns)
        con.execute(f"CREATE TABLE mods_info({column_defs})")
        # Record the columns only once the table really exists.
        current_columns.extend(new_columns)
    else:
        # Add columns if necessary
        for k in new_columns:
            con.execute(f'ALTER TABLE mods_info ADD COLUMN "{k}"')
            current_columns.append(k)

    # Insert
    # Unfortunately, Python3's sqlite3 does not like named placeholders with
    # spaces, so we have to use qmark style here.
    columns = list(d)
    column_list = ",".join(f'"{c}"' for c in columns)
    placeholders = ",".join("?" for _ in columns)
    con.execute(
        f"INSERT INTO mods_info ( {column_list} ) VALUES ( {placeholders} )",
        [str(d[c]) for c in columns],
    )
# Process METS files # Process METS files
output_file_sqlite3 = output_file + ".sqlite3"
con = sqlite3.connect(output_file_sqlite3)
with open(output_file + '.warnings.csv', 'w') as csvfile: with open(output_file + '.warnings.csv', 'w') as csvfile:
csvwriter = csv.writer(csvfile) csvwriter = csv.writer(csvfile)
mods_info = [] mods_info = []
@ -423,9 +463,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
if output_page_info: if output_page_info:
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
mods_info.append(d) insert_into_db(con, d)
if output_page_info: con.commit()
page_info.extend(page_info_doc) #TODO
#if output_page_info:
# page_info.extend(page_info_doc)
if caught_warnings: if caught_warnings:
# PyCharm thinks caught_warnings is not Iterable: # PyCharm thinks caught_warnings is not Iterable:
@ -433,8 +475,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
for caught_warning in caught_warnings: for caught_warning in caught_warnings:
csvwriter.writerow([mets_file, caught_warning.message]) csvwriter.writerow([mets_file, caught_warning.message])
except Exception as e: except Exception as e:
logger.error('Exception in {}: {}'.format(mets_file, e)) logger.exception('Exception in {}'.format(mets_file))
#import traceback; traceback.print_exc()
# Convert the mods_info List[Dict] to a pandas DataFrame # Convert the mods_info List[Dict] to a pandas DataFrame
mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")

Loading…
Cancel
Save