From a1390699d4273b3a151a0f14398c5f2def0f094d Mon Sep 17 00:00:00 2001
From: Mike Gerber
Date: Tue, 26 Nov 2024 16:27:43 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Use=20a=20temporary=20sqlite=20d?=
 =?UTF-8?q?b?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (everything between the "---" line and the diff is ignored by `git am`):
* insert_into_db() validates all new column names before running any DDL and
  records them in current_columns only after the DDL succeeded, so a failing
  document can no longer leave current_columns out of sync with the table.
* Column name validation raises ValueError instead of using assert.
* The SQL strings no longer rely on quote reuse/backslashes inside f-string
  replacement fields.
* NOTE(review): the leading whitespace of the context lines was reconstructed from
  the block structure; if the patch does not apply cleanly, retry with
  `git apply --ignore-whitespace`.
* NOTE(review): the blob hashes on the "index" line predate these review changes.

 src/mods4pandas/mods4pandas.py | 60 ++++++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index ef24d36..e12af4f 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -3,7 +3,9 @@ import csv
 import logging
 import os
 import re
+import sqlite3
 import warnings
+import sys
 from lxml import etree as ET
 from itertools import groupby
 from operator import attrgetter
@@ -394,7 +396,54 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
         else:
             mets_files_real.append(m)
 
+    current_columns = []
+
+    def valid_column_key(k):
+        """Return True if k is safe to embed as a double-quoted SQL column name."""
+        # fullmatch, because a "$" anchor would also accept a trailing newline
+        return re.fullmatch(r"[a-zA-Z0-9 _-]+", k) is not None
+
+    def quote_columns(columns):
+        """Return the column names double-quoted and comma-separated for use in SQL."""
+        return ",".join(f'"{c}"' for c in columns)
+
+    def insert_into_db(con, d: Dict):
+        """Insert d as one row into mods_info, creating the table/columns as needed."""
+        # Validate all new keys before touching the schema, and only record them in
+        # current_columns once the DDL succeeded. Otherwise a failure would leave
+        # current_columns out of sync with the table and break all later inserts.
+        new_columns = [k for k in d if k not in current_columns]
+        for k in new_columns:
+            # Not an assert: these names are interpolated into SQL, and asserts are
+            # stripped when running with "python -O".
+            if not valid_column_key(k):
+                raise ValueError(f'"{k}" is not a valid column name')
+
+        if not current_columns:
+            # Create table if necessary
+            con.execute(f"CREATE TABLE mods_info({quote_columns(new_columns)})")
+            current_columns.extend(new_columns)
+        else:
+            # Add columns if necessary
+            for k in new_columns:
+                con.execute(f'ALTER TABLE mods_info ADD COLUMN "{k}"')
+                current_columns.append(k)
+
+        # Insert
+        # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
+        # have to use qmark style here.
+        columns = list(d)
+        placeholders = ",".join(["?"] * len(columns))
+        con.execute(
+            f"INSERT INTO mods_info( {quote_columns(columns)} )"
+            f"VALUES( {placeholders} )",
+            [str(d[c]) for c in columns]
+        )
+
+
     # Process METS files
+    output_file_sqlite3 = output_file + ".sqlite3"
+    con = sqlite3.connect(output_file_sqlite3)
     with open(output_file + '.warnings.csv', 'w') as csvfile:
         csvwriter = csv.writer(csvfile)
         mods_info = []
@@ -423,9 +472,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                     if output_page_info:
                         page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
 
-                    mods_info.append(d)
-                    if output_page_info:
-                        page_info.extend(page_info_doc)
+                    insert_into_db(con, d)
+                    con.commit()
+                    #TODO
+                    #if output_page_info:
+                    #    page_info.extend(page_info_doc)
 
                     if caught_warnings:
                         # PyCharm thinks caught_warnings is not Iterable:
@@ -433,8 +484,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                         for caught_warning in caught_warnings:
                             csvwriter.writerow([mets_file, caught_warning.message])
             except Exception as e:
-                logger.error('Exception in {}: {}'.format(mets_file, e))
-                #import traceback; traceback.print_exc()
+                logger.exception('Exception in {}'.format(mets_file))
 
     # Convert the mods_info List[Dict] to a pandas DataFrame
     mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")