From b385f273915e05f7274cc51811281313be8b60eb Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 27 Nov 2024 14:43:42 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Write=20out=20page=5Finfo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/lib.py | 43 ++++++++++++++++++++ src/mods4pandas/mods4pandas.py | 71 +++++++++++----------------------- 2 files changed, 66 insertions(+), 48 deletions(-) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index d2e1f8f..302e4f2 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -2,6 +2,7 @@ from itertools import groupby import re import warnings from typing import List, Sequence, MutableMapping, Dict +from collections import defaultdict import pandas as pd import numpy as np @@ -328,3 +329,45 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame: df = pd.DataFrame(data=data, index=index, columns=columns) return df + + +def valid_column_key(k): + if re.match("^[a-zA-Z0-9 _-]+$", k): + return True + else: + return False + +current_columns = defaultdict(list) + +def insert_into_db(con, table, d: Dict): + """Insert the values from the dict into the table, creating columns if necessary""" + + # Create table if necessary + if not current_columns[table]: + for k in d.keys(): + assert valid_column_key(k), f"\"{k}\" is not a valid column name" + current_columns[table].append(k) + con.execute(f"CREATE TABLE {table} ({",".join(f"\"{c}\"" for c in current_columns[table])})") + + # Add columns if necessary + for k in d.keys(): + if not k in current_columns[table]: + assert valid_column_key(k), f"\"{k}\" is not a valid column name" + current_columns[table].append(k) + con.execute(f"ALTER TABLE {table} ADD COLUMN \"{k}\"") + + # Insert + # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we + # have use qmark style here. + columns = d.keys() + con.execute( + f"INSERT INTO {table}" + f"( {",".join(f"\"{c}\"" for c in columns)} )" + "VALUES" + f"( {",".join("?" for c in columns)} )", + [str(d[c]) for c in columns] + ) + +def insert_into_db_multiple(con, table, ld: List[Dict]): + for d in ld: + insert_into_db(con, table, d) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index e12af4f..aae282d 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -10,13 +10,14 @@ from lxml import etree as ET from itertools import groupby from operator import attrgetter from typing import Dict, List +from collections import defaultdict from collections.abc import MutableMapping, Sequence import click import pandas as pd from tqdm import tqdm -from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df +from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df, insert_into_db, insert_into_db_multiple @@ -396,45 +397,16 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): else: mets_files_real.append(m) - current_columns = [] - - def valid_column_key(k): - if re.match("^[a-zA-Z0-9 _-]+$", k): - return True - else: - return False - - def insert_into_db(con, d: Dict): - # Create table if necessary - if not current_columns: - for k in d.keys(): - assert valid_column_key(k), f"\"{k}\" is not a valid column name" - current_columns.append(k) - con.execute(f"CREATE TABLE mods_info({",".join(f"\"{c}\"" for c in current_columns)})") - - # Add columns if necessary - for k in d.keys(): - if not k in current_columns: - assert valid_column_key(k), f"\"{k}\" is not a valid column name" - current_columns.append(k) - con.execute(f"ALTER TABLE mods_info ADD COLUMN \"{k}\"") - - # Insert - # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we - # have use qmark style here. - columns = d.keys() - con.execute( - "INSERT INTO mods_info" - f"( {",".join(f"\"{c}\"" for c in columns)} )" - "VALUES" - f"( {",".join("?" for c in columns)} )", - [str(d[c]) for c in columns] - ) # Process METS files output_file_sqlite3 = output_file + ".sqlite3" con = sqlite3.connect(output_file_sqlite3) + + if output_page_info: + output_page_info_sqlite3 = output_page_info + ".sqlite3" + con_page_info = sqlite3.connect(output_page_info_sqlite3) + with open(output_file + '.warnings.csv', 'w') as csvfile: csvwriter = csv.writer(csvfile) mods_info = [] @@ -463,11 +435,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): if output_page_info: page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) - insert_into_db(con, d) + insert_into_db(con, "mods_info", d) con.commit() - #TODO - #if output_page_info: - # page_info.extend(page_info_doc) + if output_page_info: + insert_into_db_multiple(con_page_info, "page_info", page_info_doc) + con_page_info.commit() if caught_warnings: # PyCharm thinks caught_warnings is not Iterable: @@ -478,18 +450,21 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): logger.exception('Exception in {}'.format(mets_file)) # Convert the mods_info List[Dict] to a pandas DataFrame - mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") - + # TODO + # mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") + # Save the DataFrame - logger.info('Writing DataFrame to {}'.format(output_file)) - mods_info_df.to_parquet(output_file) + # TODO + #logger.info('Writing DataFrame to {}'.format(output_file)) + #mods_info_df.to_parquet(output_file) # Convert page_info - if output_page_info: - page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) - # Save the DataFrame - logger.info('Writing DataFrame to {}'.format(output_page_info)) - page_info_df.to_parquet(output_page_info) + # TODO + # if output_page_info: + # page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) + # # Save the DataFrame + # logger.info('Writing DataFrame to {}'.format(output_page_info)) + # page_info_df.to_parquet(output_page_info) def main():