From a1390699d4273b3a151a0f14398c5f2def0f094d Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 26 Nov 2024 16:27:43 +0100 Subject: [PATCH 01/37] =?UTF-8?q?=F0=9F=9A=A7=20Use=20a=20temporary=20sqli?= =?UTF-8?q?te=20db?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 51 ++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index ef24d36..e12af4f 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -3,7 +3,9 @@ import csv import logging import os import re +import sqlite3 import warnings +import sys from lxml import etree as ET from itertools import groupby from operator import attrgetter @@ -394,7 +396,45 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): else: mets_files_real.append(m) + current_columns = [] + + def valid_column_key(k): + if re.match("^[a-zA-Z0-9 _-]+$", k): + return True + else: + return False + + def insert_into_db(con, d: Dict): + # Create table if necessary + if not current_columns: + for k in d.keys(): + assert valid_column_key(k), f"\"{k}\" is not a valid column name" + current_columns.append(k) + con.execute(f"CREATE TABLE mods_info({",".join(f"\"{c}\"" for c in current_columns)})") + + # Add columns if necessary + for k in d.keys(): + if not k in current_columns: + assert valid_column_key(k), f"\"{k}\" is not a valid column name" + current_columns.append(k) + con.execute(f"ALTER TABLE mods_info ADD COLUMN \"{k}\"") + + # Insert + # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we + # have use qmark style here. + columns = d.keys() + con.execute( + "INSERT INTO mods_info" + f"( {",".join(f"\"{c}\"" for c in columns)} )" + "VALUES" + f"( {",".join("?" for c in columns)} )", + [str(d[c]) for c in columns] + ) + + # Process METS files + output_file_sqlite3 = output_file + ".sqlite3" + con = sqlite3.connect(output_file_sqlite3) with open(output_file + '.warnings.csv', 'w') as csvfile: csvwriter = csv.writer(csvfile) mods_info = [] @@ -423,9 +463,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): if output_page_info: page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) - mods_info.append(d) - if output_page_info: - page_info.extend(page_info_doc) + insert_into_db(con, d) + con.commit() + #TODO + #if output_page_info: + # page_info.extend(page_info_doc) if caught_warnings: # PyCharm thinks caught_warnings is not Iterable: @@ -433,8 +475,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): for caught_warning in caught_warnings: csvwriter.writerow([mets_file, caught_warning.message]) except Exception as e: - logger.error('Exception in {}: {}'.format(mets_file, e)) - #import traceback; traceback.print_exc() + logger.exception('Exception in {}'.format(mets_file)) # Convert the mods_info List[Dict] to a pandas DataFrame mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") From b385f273915e05f7274cc51811281313be8b60eb Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 27 Nov 2024 14:43:42 +0100 Subject: [PATCH 02/37] =?UTF-8?q?=F0=9F=9A=A7=20Write=20out=20page=5Finfo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/lib.py | 43 ++++++++++++++++++++ src/mods4pandas/mods4pandas.py | 71 +++++++++++----------------------- 2 files changed, 66 insertions(+), 48 deletions(-) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index d2e1f8f..302e4f2 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -2,6 +2,7 @@ from itertools import groupby import re import warnings from typing import List, Sequence, MutableMapping, Dict +from collections import defaultdict import pandas as pd import numpy as np @@ -328,3 +329,45 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame: df = pd.DataFrame(data=data, index=index, columns=columns) return df + + +def valid_column_key(k): + if re.match("^[a-zA-Z0-9 _-]+$", k): + return True + else: + return False + +current_columns = defaultdict(list) + +def insert_into_db(con, table, d: Dict): + """Insert the values from the dict into the table, creating columns if necessary""" + + # Create table if necessary + if not current_columns[table]: + for k in d.keys(): + assert valid_column_key(k), f"\"{k}\" is not a valid column name" + current_columns[table].append(k) + con.execute(f"CREATE TABLE {table} ({",".join(f"\"{c}\"" for c in current_columns[table])})") + + # Add columns if necessary + for k in d.keys(): + if not k in current_columns[table]: + assert valid_column_key(k), f"\"{k}\" is not a valid column name" + current_columns[table].append(k) + con.execute(f"ALTER TABLE {table} ADD COLUMN \"{k}\"") + + # Insert + # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we + # have use qmark style here. + columns = d.keys() + con.execute( + f"INSERT INTO {table}" + f"( {",".join(f"\"{c}\"" for c in columns)} )" + "VALUES" + f"( {",".join("?" for c in columns)} )", + [str(d[c]) for c in columns] + ) + +def insert_into_db_multiple(con, table, ld: List[Dict]): + for d in ld: + insert_into_db(con, table, d) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index e12af4f..aae282d 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -10,13 +10,14 @@ from lxml import etree as ET from itertools import groupby from operator import attrgetter from typing import Dict, List +from collections import defaultdict from collections.abc import MutableMapping, Sequence import click import pandas as pd from tqdm import tqdm -from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df +from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df, insert_into_db, insert_into_db_multiple @@ -396,45 +397,16 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): else: mets_files_real.append(m) - current_columns = [] - - def valid_column_key(k): - if re.match("^[a-zA-Z0-9 _-]+$", k): - return True - else: - return False - - def insert_into_db(con, d: Dict): - # Create table if necessary - if not current_columns: - for k in d.keys(): - assert valid_column_key(k), f"\"{k}\" is not a valid column name" - current_columns.append(k) - con.execute(f"CREATE TABLE mods_info({",".join(f"\"{c}\"" for c in current_columns)})") - - # Add columns if necessary - for k in d.keys(): - if not k in current_columns: - assert valid_column_key(k), f"\"{k}\" is not a valid column name" - current_columns.append(k) - con.execute(f"ALTER TABLE mods_info ADD COLUMN \"{k}\"") - - # Insert - # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we - # have use qmark style here. - columns = d.keys() - con.execute( - "INSERT INTO mods_info" - f"( {",".join(f"\"{c}\"" for c in columns)} )" - "VALUES" - f"( {",".join("?" for c in columns)} )", - [str(d[c]) for c in columns] - ) # Process METS files output_file_sqlite3 = output_file + ".sqlite3" con = sqlite3.connect(output_file_sqlite3) + + if output_page_info: + output_page_info_sqlite3 = output_page_info + ".sqlite3" + con_page_info = sqlite3.connect(output_page_info_sqlite3) + with open(output_file + '.warnings.csv', 'w') as csvfile: csvwriter = csv.writer(csvfile) mods_info = [] @@ -463,11 +435,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): if output_page_info: page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) - insert_into_db(con, d) + insert_into_db(con, "mods_info", d) con.commit() - #TODO - #if output_page_info: - # page_info.extend(page_info_doc) + if output_page_info: + insert_into_db_multiple(con_page_info, "page_info", page_info_doc) + con_page_info.commit() if caught_warnings: # PyCharm thinks caught_warnings is not Iterable: @@ -478,18 +450,21 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): logger.exception('Exception in {}'.format(mets_file)) # Convert the mods_info List[Dict] to a pandas DataFrame - mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") - + # TODO + # mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") + # Save the DataFrame - logger.info('Writing DataFrame to {}'.format(output_file)) - mods_info_df.to_parquet(output_file) + # TODO + #logger.info('Writing DataFrame to {}'.format(output_file)) + #mods_info_df.to_parquet(output_file) # Convert page_info - if output_page_info: - page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) - # Save the DataFrame - logger.info('Writing DataFrame to {}'.format(output_page_info)) - page_info_df.to_parquet(output_page_info) + # TODO + # if output_page_info: + # page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) + # # Save the DataFrame + # logger.info('Writing DataFrame to {}'.format(output_page_info)) + # page_info_df.to_parquet(output_page_info) def main(): From eeaad036860c5510633d2a664d8d55604b78cfa0 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 27 Nov 2024 16:38:18 +0100 Subject: [PATCH 03/37] =?UTF-8?q?=F0=9F=9A=A7=20Avoid=20nested=20quotes=20?= =?UTF-8?q?for=20Python=20<=203.12?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/lib.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 302e4f2..2f50f43 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -337,6 +337,14 @@ def valid_column_key(k): else: return False +def column_names_csv(columns): + """ + Format Column names (identifiers) as a comma-separated list. + + This uses double quotes per SQL standard. + """ + return ",".join('"' + c + '"' for c in columns) + current_columns = defaultdict(list) def insert_into_db(con, table, d: Dict): @@ -345,16 +353,16 @@ def insert_into_db(con, table, d: Dict): # Create table if necessary if not current_columns[table]: for k in d.keys(): - assert valid_column_key(k), f"\"{k}\" is not a valid column name" + assert valid_column_key(k), f'"{k}" is not a valid column name' current_columns[table].append(k) - con.execute(f"CREATE TABLE {table} ({",".join(f"\"{c}\"" for c in current_columns[table])})") + con.execute(f"CREATE TABLE {table} ({column_names_csv(current_columns[table])})") # Add columns if necessary for k in d.keys(): if not k in current_columns[table]: - assert valid_column_key(k), f"\"{k}\" is not a valid column name" + assert valid_column_key(k), f'"{k}" is not a valid column name' current_columns[table].append(k) - con.execute(f"ALTER TABLE {table} ADD COLUMN \"{k}\"") + con.execute(f'ALTER TABLE {table} ADD COLUMN "{k}"') # Insert # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we @@ -362,9 +370,9 @@ def insert_into_db(con, table, d: Dict): columns = d.keys() con.execute( f"INSERT INTO {table}" - f"( {",".join(f"\"{c}\"" for c in columns)} )" + f"( {column_names_csv(columns)} )" "VALUES" - f"( {",".join("?" for c in columns)} )", + f"( {','.join('?' for c in columns)} )", [str(d[c]) for c in columns] ) From 22b62d7a2fbbef453562c9c3738bb1a0c5f6db7c Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 27 Nov 2024 18:48:56 +0100 Subject: [PATCH 04/37] =?UTF-8?q?=F0=9F=90=9B=20Remove=20output=20files=20?= =?UTF-8?q?before=20writing=20them=20again?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index aae282d..215acd0 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import contextlib import csv import logging import os @@ -401,10 +402,14 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): # Process METS files output_file_sqlite3 = output_file + ".sqlite3" + with contextlib.suppress(FileNotFoundError): + os.remove(output_file_sqlite3) con = sqlite3.connect(output_file_sqlite3) if output_page_info: output_page_info_sqlite3 = output_page_info + ".sqlite3" + with contextlib.suppress(FileNotFoundError): + os.remove(output_page_info_sqlite3) con_page_info = sqlite3.connect(output_page_info_sqlite3) with open(output_file + '.warnings.csv', 'w') as csvfile: From 11d7b012ec3cc2762b1baea77aee0efc949b99a4 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 27 Nov 2024 19:03:35 +0100 Subject: [PATCH 05/37] =?UTF-8?q?=F0=9F=90=9B=20Leave=20tqdm=20progress=20?= =?UTF-8?q?bar=20to=20avoid=20confusion=20through=20other=20left-over=20pr?= =?UTF-8?q?ogress=20bars?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 215acd0..c3cde97 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -417,7 +417,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): mods_info = [] page_info = [] logger.info('Processing METS files') - for mets_file in tqdm(mets_files_real, leave=False): + for mets_file in tqdm(mets_files_real, leave=True): try: root = ET.parse(mets_file).getroot() mets = root # XXX .find('mets:mets', ns) does not work here From 939967edc8863d7781d2311631cab3e94c3fc1c3 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 27 Nov 2024 19:05:05 +0100 Subject: [PATCH 06/37] =?UTF-8?q?=F0=9F=90=9B=20De-couple=20mods=5Finfo=20?= =?UTF-8?q?from=20page=5Finfo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index c3cde97..e08fe37 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -436,13 +436,13 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): # "meta" d['mets_file'] = mets_file + # Save + insert_into_db(con, "mods_info", d) + con.commit() + # METS - per-page if output_page_info: page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) - - insert_into_db(con, "mods_info", d) - con.commit() - if output_page_info: insert_into_db_multiple(con_page_info, "page_info", page_info_doc) con_page_info.commit() From 8ee4c3d0bcb4a65fec343c0317a6df211251f311 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 27 Nov 2024 19:56:36 +0100 Subject: [PATCH 07/37] =?UTF-8?q?=F0=9F=90=9B=20Normalize=20structure=20ty?= =?UTF-8?q?pe=20names=20to=20lower=20case?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index e08fe37..b4051f9 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -362,7 +362,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: # Populate structure type indicator variables for struct_div in struct_divs: - type_ = struct_div.attrib.get("TYPE") + type_ = struct_div.attrib.get("TYPE").lower() assert type_ page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = 1 From abb20b8ba9fca2e36d12df0198cb2b2de7830b19 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 28 Nov 2024 14:25:27 +0100 Subject: [PATCH 08/37] =?UTF-8?q?=F0=9F=90=9B=20Add=20multivolume=20type?= =?UTF-8?q?=20'multivolume=5Fmanuscript'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index b4051f9..4427f13 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -277,7 +277,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: # This is expected in a multivolume work or periodical! if any( structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None - for t in ["multivolume_work", "MultivolumeWork", "periodical"] + for t in ["multivolume_work", "MultivolumeWork", "multivolume_manuscript", "periodical"] ): return [] else: From 11a04916f38631a2e55192bcb4db9b25df8f384b Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 28 Nov 2024 18:27:39 +0100 Subject: [PATCH 09/37] =?UTF-8?q?=F0=9F=90=9B=20Write=20mods=5Finfo=20Parq?= =?UTF-8?q?uet=20file=20again?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 4427f13..4fabb52 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -399,19 +399,24 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): mets_files_real.append(m) - - # Process METS files + # Prepare output files + with contextlib.suppress(FileNotFoundError): + os.remove(output_file) output_file_sqlite3 = output_file + ".sqlite3" with contextlib.suppress(FileNotFoundError): os.remove(output_file_sqlite3) + + logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3)) con = sqlite3.connect(output_file_sqlite3) if output_page_info: output_page_info_sqlite3 = output_page_info + ".sqlite3" + logger.info('Writing SQLite DB to {}'.format(output_page_info_sqlite3)) with contextlib.suppress(FileNotFoundError): os.remove(output_page_info_sqlite3) con_page_info = sqlite3.connect(output_page_info_sqlite3) + # Process METS files with open(output_file + '.warnings.csv', 'w') as csvfile: csvwriter = csv.writer(csvfile) mods_info = [] @@ -454,14 +459,12 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): except Exception as e: logger.exception('Exception in {}'.format(mets_file)) - # Convert the mods_info List[Dict] to a pandas DataFrame - # TODO - # mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") - + # Convert the mods_info SQL to a pandas DataFrame + mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier") + # Save the DataFrame - # TODO - #logger.info('Writing DataFrame to {}'.format(output_file)) - #mods_info_df.to_parquet(output_file) + logger.info('Writing DataFrame to {}'.format(output_file)) + mods_info_df.to_parquet(output_file) # Convert page_info # TODO From 6981efb87cf236f531bb72af026bd18d7c601b4a Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 28 Nov 2024 18:32:40 +0100 Subject: [PATCH 10/37] =?UTF-8?q?=F0=9F=90=9B=20Write=20page=5Finfo=20Parq?= =?UTF-8?q?uet=20file=20again?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 4fabb52..46ebfc8 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -461,18 +461,15 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): # Convert the mods_info SQL to a pandas DataFrame mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier") - - # Save the DataFrame logger.info('Writing DataFrame to {}'.format(output_file)) mods_info_df.to_parquet(output_file) - # Convert page_info - # TODO - # if output_page_info: - # page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) - # # Save the DataFrame - # logger.info('Writing DataFrame to {}'.format(output_page_info)) - # page_info_df.to_parquet(output_page_info) + if output_page_info: + # Convert page_info SQL to a pandas DataFrama + page_info_df = pd.read_sql_query("SELECT * FROM page_info", con_page_info, index_col=["ppn", "ID"]) + # Save the DataFrame + logger.info('Writing DataFrame to {}'.format(output_page_info)) + page_info_df.to_parquet(output_page_info) def main(): From ca8f165955f36292958ce3a752b12469f6cdc824 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 28 Nov 2024 20:05:55 +0100 Subject: [PATCH 11/37] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20redundant=20comme?= =?UTF-8?q?nt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 46ebfc8..4479844 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -467,7 +467,6 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): if output_page_info: # Convert page_info SQL to a pandas DataFrama page_info_df = pd.read_sql_query("SELECT * FROM page_info", con_page_info, index_col=["ppn", "ID"]) - # Save the DataFrame logger.info('Writing DataFrame to {}'.format(output_page_info)) page_info_df.to_parquet(output_page_info) From 39f7d8646a8433fa11472216b5fe27550a820f45 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Fri, 29 Nov 2024 15:53:00 +0100 Subject: [PATCH 12/37] =?UTF-8?q?=F0=9F=9A=A7=20Use=20temporary=20SQLite?= =?UTF-8?q?=20DB=20for=20alto4pandas,=20too?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/alto4pandas.py | 53 +++++++++++++++++----------------- src/mods4pandas/lib.py | 2 +- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 1508150..8dde40a 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -5,6 +5,8 @@ import os import re import warnings import sys +import contextlib +import sqlite3 from xml.dom.expatbuilder import Namespaces from lxml import etree as ET from itertools import groupby @@ -17,7 +19,7 @@ import pandas as pd import numpy as np from tqdm import tqdm -from .lib import TagGroup, sorted_groupby, flatten, ns +from .lib import TagGroup, sorted_groupby, flatten, ns, insert_into_db logger = logging.getLogger('alto4pandas') @@ -121,18 +123,19 @@ def walk(m): @click.command() @click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1) -@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file', - default='alto_info_df.pkl', show_default=True) -@click.option('--output-csv', type=click.Path(), help='Output CSV file') -@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') -def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str): +@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file', + default='alto_info_df.parquet', show_default=True) +def process(alto_files: List[str], output_file: str): """ A tool to convert the ALTO metadata in INPUT to a pandas DataFrame. INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads all files in the directory. - alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings. + alto4pandas writes multiple output files: + - A Parquet DataFrame + - A SQLite database + - and a CSV file with all conversion warnings. """ # Extend file list if directories are given @@ -141,10 +144,19 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls for x in walk(m): alto_files_real.append(x) + # Prepare output files + with contextlib.suppress(FileNotFoundError): + os.remove(output_file) + output_file_sqlite3 = output_file + ".sqlite3" + with contextlib.suppress(FileNotFoundError): + os.remove(output_file_sqlite3) + + logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3)) + con = sqlite3.connect(output_file_sqlite3) + # Process ALTO files with open(output_file + '.warnings.csv', 'w') as csvfile: csvwriter = csv.writer(csvfile) - alto_info = [] logger.info('Processing ALTO files') for alto_file in tqdm(alto_files_real, leave=False): try: @@ -160,7 +172,9 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls d['alto_file'] = alto_file d['alto_xmlns'] = ET.QName(alto).namespace - alto_info.append(d) + # Save + insert_into_db(con, "alto_info", d) + con.commit if caught_warnings: # PyCharm thinks caught_warnings is not Iterable: @@ -171,25 +185,10 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls logger.error('Exception in {}: {}'.format(alto_file, e)) import traceback; traceback.print_exc() - # Convert the alto_info List[Dict] to a pandas DataFrame - columns = [] - for m in alto_info: - for c in m.keys(): - if c not in columns: - columns.append(c) - data = [[m.get(c) for c in columns] for m in alto_info] - index = [m['alto_file'] for m in alto_info] # TODO use ppn + page? - alto_info_df = pd.DataFrame(data=data, index=index, columns=columns) - - # Pickle the DataFrame + # Convert the alto_info SQL to a pandas DataFrame + alto_info_df = pd.read_sql_query("SELECT * FROM alto_info", con, index_col="alto_file") logger.info('Writing DataFrame to {}'.format(output_file)) - alto_info_df.to_pickle(output_file) - if output_csv: - logger.info('Writing CSV to {}'.format(output_csv)) - alto_info_df.to_csv(output_csv) - if output_xlsx: - logger.info('Writing Excel .xlsx to {}'.format(output_xlsx)) - alto_info_df.to_excel(output_xlsx) + alto_info_df.to_parquet(output_file) def main(): diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 2f50f43..11c88a3 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -332,7 +332,7 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame: def valid_column_key(k): - if re.match("^[a-zA-Z0-9 _-]+$", k): + if re.match("^[a-zA-Z0-9 _@/:\[\]-]+$", k): return True else: return False From 6af4a6f67128f83585f29710bfb7c5a62411a6ed Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 3 Dec 2024 17:02:12 +0100 Subject: [PATCH 13/37] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20unused/obsolete?= =?UTF-8?q?=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/lib.py | 30 ------------------------------ src/mods4pandas/mods4pandas.py | 2 +- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 11c88a3..8a65901 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -301,36 +301,6 @@ def flatten(d: MutableMapping, parent='', separator='_'): return dict(items) -def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame: - """ - Convert the given list of dicts to a Pandas DataFrame. - - The keys of the dicts make the columns. - """ - - # Build columns from keys - columns = [] - for m in data_list: - for c in m.keys(): - if c not in columns: - columns.append(c) - - # Build data table - data = [[m.get(c) for c in columns] for m in data_list] - - # Build index - if isinstance(index_column, str): - index = [m[index_column] for m in data_list] - elif isinstance(index_column, tuple): - index = [[m[c] for m in data_list] for c in index_column] - index = pd.MultiIndex.from_arrays(index, names=index_column) - else: - raise ValueError(f"index_column must") - - df = pd.DataFrame(data=data, index=index, columns=columns) - return df - - def valid_column_key(k): if re.match("^[a-zA-Z0-9 _@/:\[\]-]+$", k): return True diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 4479844..ee1f45d 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -18,7 +18,7 @@ import click import pandas as pd from tqdm import tqdm -from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df, insert_into_db, insert_into_db_multiple +from .lib import sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple From ef4eeac7e249856439756b054b175470c338604f Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 3 Dec 2024 17:02:24 +0100 Subject: [PATCH 14/37] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20unused/obsolete?= =?UTF-8?q?=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index ee1f45d..30d7c22 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -419,8 +419,6 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): # Process METS files with open(output_file + '.warnings.csv', 'w') as csvfile: csvwriter = csv.writer(csvfile) - mods_info = [] - page_info = [] logger.info('Processing METS files') for mets_file in tqdm(mets_files_real, leave=True): try: From 4d6e1f4ff4e164c1319aaaae51cb2b366d30fb96 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 3 Dec 2024 17:24:24 +0100 Subject: [PATCH 15/37] =?UTF-8?q?=F0=9F=90=9B=20Add=20missing=20tag=20alto?= =?UTF-8?q?:fileIdentifier?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/alto4pandas.py | 2 + .../data/alto/PPN1844793923/00000017.xml | 663 ++++++++++++++++++ 2 files changed, 665 insertions(+) create mode 100644 src/mods4pandas/tests/data/alto/PPN1844793923/00000017.xml diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 8dde40a..8c8f934 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -76,6 +76,8 @@ def alto_to_dict(alto, raise_errors=True): value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) elif localname == 'fileName': value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'fileIdentifier': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() elif localname == 'Layout': value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) diff --git a/src/mods4pandas/tests/data/alto/PPN1844793923/00000017.xml b/src/mods4pandas/tests/data/alto/PPN1844793923/00000017.xml new file mode 100644 index 0000000..7f658fa --- /dev/null +++ b/src/mods4pandas/tests/data/alto/PPN1844793923/00000017.xml @@ -0,0 +1,663 @@ + + + + + + pixel + + 16_b079a_default.jpg + https://content.staatsbibliothek-berlin.de/dc/1844793923-0017/full/full/0/default.jpg + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 88a6c5f26f310eea659036492b3a39f1d3fd2c20 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 3 Dec 2024 17:34:07 +0100 Subject: [PATCH 16/37] =?UTF-8?q?=F0=9F=90=9B=20alto4pandas:=20*Really*=20?= =?UTF-8?q?commit=20data=20to=20SQLite=20DB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/alto4pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 8c8f934..77e23e2 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -176,7 +176,7 @@ def process(alto_files: List[str], output_file: str): # Save insert_into_db(con, "alto_info", d) - con.commit + con.commit() if caught_warnings: # PyCharm thinks caught_warnings is not Iterable: From be1c8609a3fcbf6461c7c4c17d88fa7dbe1ffb04 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 28 May 2025 20:44:12 +0200 Subject: [PATCH 17/37] =?UTF-8?q?=F0=9F=9A=A7=20Check=20dtypes=20(WIP)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 check_dtypes.py diff --git a/check_dtypes.py b/check_dtypes.py new file mode 100644 index 0000000..64fe514 --- /dev/null +++ b/check_dtypes.py @@ -0,0 +1,57 @@ +import pandas as pd +import re + + +# Fix +mods_info = pd.read_parquet("mods_info_df.parquet") +for c in mods_info.columns: + if c.endswith("-count"): + mods_info[c] = mods_info[c].astype('Int64') + + +# Tmp to parquet +mods_info.to_parquet("tmp.parquet") +mods_info = pd.read_parquet("tmp.parquet") + + +# Check +EXPECTED_TYPES = { + r"mets_file": ("object", ["str"]), + r"titleInfo_title": ("object", ["str"]), + r"titleInfo_subTitle": ("object", ["str", "NoneType"]), + r"titleInfo_partName": ("object", ["str", "NoneType"]), + r"identifier-.*": ("object", ["str", "NoneType"]), + r"location_.*t ": ("object", ["str", "NoneType"]), + r"name\d+_.*": ("object", ["str", "NoneType"]), + r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), + r".*-count": ("Int64", None), + + # XXX possibly sets: + r"genre-.*": ("object", ["str", "NoneType"]), + r"subject-.*": ("object", ["str", "NoneType"]), + r"language_.*Term": ("object", ["str", "NoneType"]), +} +def expected_types(c): + for r, types in EXPECTED_TYPES.items(): + if re.fullmatch(r, c): + edt = types[0] + einner_types = types[1] + if einner_types: + einner_types = set(einner_types) + return edt, einner_types + return None, None + +for c in mods_info.columns: + dt = mods_info.dtypes[c] + edt, einner_types = expected_types(c) + + if edt is None: + print(f"No expected dtype known for column {c}") + elif dt != edt: + print(f"Unexpected dtype {dt} for column {c} (expected {edt})") + + if edt == "object": + inner_types = set(type(v).__name__ for v in mods_info[c]) + if any(it not in einner_types for it in inner_types): + print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})") + From 15f603671c1ef4e640d433ae921043a3052c19fa Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 2 Jun 2025 15:36:35 +0200 Subject: [PATCH 18/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20Fix=20lo?= =?UTF-8?q?cation=5F.*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_dtypes.py b/check_dtypes.py index 64fe514..991fd34 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -21,7 +21,7 @@ EXPECTED_TYPES = { r"titleInfo_subTitle": ("object", ["str", "NoneType"]), r"titleInfo_partName": ("object", ["str", "NoneType"]), r"identifier-.*": ("object", ["str", "NoneType"]), - r"location_.*t ": ("object", ["str", "NoneType"]), + r"location_.*": ("object", ["str", "NoneType"]), r"name\d+_.*": ("object", ["str", "NoneType"]), r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), r".*-count": ("Int64", None), From d8d3f12cb0fb6cf3fc17ff4e8f9ba3ac4c5406f2 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 2 Jun 2025 15:36:57 +0200 Subject: [PATCH 19/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20classifi?= =?UTF-8?q?cation-.*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/check_dtypes.py b/check_dtypes.py index 991fd34..3a24a96 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -30,6 +30,7 @@ EXPECTED_TYPES = { r"genre-.*": ("object", ["str", "NoneType"]), r"subject-.*": ("object", ["str", "NoneType"]), r"language_.*Term": ("object", ["str", "NoneType"]), + r"classification-.*": ("object", ["str", "NoneType"]), } def expected_types(c): for r, types in EXPECTED_TYPES.items(): From 383c6b2d3d2bca3233a025e530e50233eaa06d5f Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 2 Jun 2025 15:38:37 +0200 Subject: [PATCH 20/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20typeOfRe?= =?UTF-8?q?source?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/check_dtypes.py b/check_dtypes.py index 3a24a96..bae19cd 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -25,6 +25,7 @@ EXPECTED_TYPES = { r"name\d+_.*": ("object", ["str", "NoneType"]), r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), r".*-count": ("Int64", None), + r"typeOfResource": ("object", ["str", "NoneType"]), # XXX possibly sets: r"genre-.*": ("object", ["str", "NoneType"]), From f5f2dc05a3ab5c737a27ffd618443ad0b17923b4 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 2 Jun 2025 15:40:43 +0200 Subject: [PATCH 21/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20accessCo?= =?UTF-8?q?ndition-.*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/check_dtypes.py b/check_dtypes.py index bae19cd..082d445 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -26,6 +26,7 @@ EXPECTED_TYPES = { r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), r".*-count": ("Int64", None), r"typeOfResource": ("object", ["str", "NoneType"]), + r"accessCondition-.*": ("object", ["str", "NoneType"]), # XXX possibly sets: r"genre-.*": ("object", ["str", "NoneType"]), From ff39da49e875d33a5a87b03233c07e534502dae3 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 2 Jun 2025 15:43:50 +0200 Subject: [PATCH 22/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20originIn?= =?UTF-8?q?fo-.*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/check_dtypes.py b/check_dtypes.py index 082d445..26a38d9 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -27,6 +27,7 @@ EXPECTED_TYPES = { r".*-count": ("Int64", None), r"typeOfResource": ("object", ["str", "NoneType"]), r"accessCondition-.*": ("object", ["str", "NoneType"]), + r"originInfo-.*": ("object", ["str", "NoneType"]), # XXX possibly sets: r"genre-.*": ("object", ["str", "NoneType"]), From bec59242a0d38627cc2be7a9d55a2972d8330b67 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 2 Jun 2025 15:44:11 +0200 Subject: [PATCH 23/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20group=20?= =?UTF-8?q?by=20types?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/check_dtypes.py b/check_dtypes.py index 26a38d9..3024d9b 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -24,11 +24,12 @@ EXPECTED_TYPES = { r"location_.*": ("object", ["str", "NoneType"]), r"name\d+_.*": ("object", ["str", "NoneType"]), r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), - r".*-count": ("Int64", None), r"typeOfResource": ("object", ["str", "NoneType"]), r"accessCondition-.*": ("object", ["str", "NoneType"]), r"originInfo-.*": ("object", ["str", "NoneType"]), + r".*-count": ("Int64", None), + # XXX possibly sets: r"genre-.*": ("object", ["str", "NoneType"]), r"subject-.*": ("object", ["str", "NoneType"]), From 8bc443f9fb1e4db7846781c19fe6aba922d8213d Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 4 Jun 2025 19:05:34 +0200 Subject: [PATCH 24/37] =?UTF-8?q?=F0=9F=8E=A8=20Install=20mypy=20and=20typ?= =?UTF-8?q?e=20stubs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-DEV.md | 2 +- requirements-dev.txt | 7 +++++++ requirements-test.txt | 2 -- 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 requirements-dev.txt delete mode 100644 requirements-test.txt diff --git a/README-DEV.md b/README-DEV.md index 134e784..33da234 100644 --- a/README-DEV.md +++ b/README-DEV.md @@ -1,5 +1,5 @@ ``` -pip install -r requirements-test.txt +pip install -r requirements-dev.txt ``` To run tests: diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..5020dd0 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +pytest +pytest-profiling + +mypy +types-lxml +types-tqdm +pandas-stubs diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index 6f0f369..0000000 --- a/requirements-test.txt +++ /dev/null @@ -1,2 +0,0 @@ -pytest -pytest-profiling From 14172e3b8183a5be66cdec6988949ec5ea253c44 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 4 Jun 2025 20:32:07 +0200 Subject: [PATCH 25/37] =?UTF-8?q?=F0=9F=9A=A7=20Save=20Python=20types=20fo?= =?UTF-8?q?r=20later=20conversion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/lib.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 8a65901..082ed9a 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -315,7 +315,8 @@ def column_names_csv(columns): """ return ",".join('"' + c + '"' for c in columns) -current_columns = defaultdict(list) +current_columns: defaultdict = defaultdict(list) +current_columns_types: dict[dict] = defaultdict(dict) def insert_into_db(con, table, d: Dict): """Insert the values from the dict into the table, creating columns if necessary""" @@ -334,6 +335,11 @@ def insert_into_db(con, table, d: Dict): current_columns[table].append(k) con.execute(f'ALTER TABLE {table} ADD COLUMN "{k}"') + # Save types + for k in d.keys(): + if k not in current_columns_types[table]: + current_columns_types[table][k] = type(d[k]).__name__ + # Insert # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we # have use qmark style here. From ebe988cfff18b7849551aa7e1acc6ac26fd2d21f Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 4 Jun 2025 21:10:10 +0200 Subject: [PATCH 26/37] =?UTF-8?q?=F0=9F=9A=A7=20Restore=20types=20before?= =?UTF-8?q?=20saving=20as=20Parquet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 10 ---------- src/mods4pandas/alto4pandas.py | 5 ++--- src/mods4pandas/lib.py | 21 +++++++++++++++++++++ src/mods4pandas/mods4pandas.py | 11 +++-------- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/check_dtypes.py b/check_dtypes.py index 3024d9b..cbdfd70 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -2,17 +2,7 @@ import pandas as pd import re -# Fix mods_info = pd.read_parquet("mods_info_df.parquet") -for c in mods_info.columns: - if c.endswith("-count"): - mods_info[c] = mods_info[c].astype('Int64') - - -# Tmp to parquet -mods_info.to_parquet("tmp.parquet") -mods_info = pd.read_parquet("tmp.parquet") - # Check EXPECTED_TYPES = { diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 77e23e2..0739f35 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -19,7 +19,7 @@ import pandas as pd import numpy as np from tqdm import tqdm -from .lib import TagGroup, sorted_groupby, flatten, ns, insert_into_db +from .lib import TagGroup, convert_db_to_parquet, sorted_groupby, flatten, ns, insert_into_db logger = logging.getLogger('alto4pandas') @@ -188,9 +188,8 @@ def process(alto_files: List[str], output_file: str): import traceback; traceback.print_exc() # Convert the alto_info SQL to a pandas DataFrame - alto_info_df = pd.read_sql_query("SELECT * FROM alto_info", con, index_col="alto_file") logger.info('Writing DataFrame to {}'.format(output_file)) - alto_info_df.to_parquet(output_file) + convert_db_to_parquet(con, "alto_info", "alto_file", output_file) def main(): diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 082ed9a..32f717a 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -355,3 +355,24 @@ def insert_into_db(con, table, d: Dict): def insert_into_db_multiple(con, table, ld: List[Dict]): for d in ld: insert_into_db(con, table, d) + +def convert_db_to_parquet(con, table, index_col, output_file): + df = pd.read_sql_query(f"SELECT * FROM {table}", con, index_col) + + # Convert Python column type into Pandas type + for c in df.columns: + column_type = current_columns_types[table][c] + + if column_type == "str": + continue + elif column_type == "int": + df[c] = df[c].astype("Int64") + elif column_type == "float64": + df[c] = df[c].astype("Float64") + elif column_type == "set": + # TODO WIP + continue + else: + raise NotImplementedError(f"Column type {column_type} not implemented yet.") + + df.to_parquet(output_file) \ No newline at end of file diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 30d7c22..2da7c80 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -18,7 +18,7 @@ import click import pandas as pd from tqdm import tqdm -from .lib import sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple +from .lib import convert_db_to_parquet, sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple, current_columns_types @@ -457,16 +457,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): except Exception as e: logger.exception('Exception in {}'.format(mets_file)) - # Convert the mods_info SQL to a pandas DataFrame - mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier") logger.info('Writing DataFrame to {}'.format(output_file)) - mods_info_df.to_parquet(output_file) - + convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file) if output_page_info: - # Convert page_info SQL to a pandas DataFrama - page_info_df = pd.read_sql_query("SELECT * FROM page_info", con_page_info, index_col=["ppn", "ID"]) logger.info('Writing DataFrame to {}'.format(output_page_info)) - page_info_df.to_parquet(output_page_info) + convert_db_to_parquet(con_page_info, "page_info", ["ppn", "ID"], output_page_info) def main(): From 44550ff926400410907d70d35b6a27b3323d1b61 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 14:30:06 +0200 Subject: [PATCH 27/37] =?UTF-8?q?=F0=9F=A4=93=20requirements-dev:=20add=20?= =?UTF-8?q?ipython?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements-dev.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5020dd0..e63c022 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,8 @@ pytest pytest-profiling +ipython + mypy types-lxml types-tqdm From 580442a4c9430964cd2ee07c16301cc737834667 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 14:36:29 +0200 Subject: [PATCH 28/37] =?UTF-8?q?=F0=9F=A4=93=20Add=20type=20annotations?= =?UTF-8?q?=20(and=20related=20changes)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/lib.py | 55 ++++++++++++++++++---------------- src/mods4pandas/mods4pandas.py | 4 +-- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 32f717a..cff8ea9 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from itertools import groupby import re import warnings @@ -24,40 +26,40 @@ ns = { class TagGroup: """Helper class to simplify the parsing and checking of MODS metadata""" - def __init__(self, tag, group: List[ET.Element]): + def __init__(self, tag, group: List[ET._Element]): self.tag = tag self.group = group - def to_xml(self): + def to_xml(self) -> str: return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group) - def __str__(self): + def __str__(self) -> str: return f"TagGroup with content:\n{self.to_xml()}" - def is_singleton(self): + def is_singleton(self) -> TagGroup: if len(self.group) != 1: raise ValueError('More than one instance: {}'.format(self)) return self - def has_no_attributes(self): + def has_no_attributes(self) -> TagGroup: return self.has_attributes({}) - def has_attributes(self, attrib): + def has_attributes(self, attrib) -> TagGroup: if not isinstance(attrib, Sequence): attrib = [attrib] if not all(e.attrib in attrib for e in self.group): raise ValueError('One or more element has unexpected attributes: {}'.format(self)) return self - def ignore_attributes(self): + def ignore_attributes(self) -> TagGroup: # This serves as documentation for now. return self - def sort(self, key=None, reverse=False): + def sort(self, key=None, reverse=False) -> TagGroup: self.group = sorted(self.group, key=key, reverse=reverse) return self - def text(self, separator='\n'): + def text(self, separator='\n') -> str: t = '' for e in self.group: if t != '': @@ -66,13 +68,13 @@ class TagGroup: t += e.text return t - def text_set(self): + def text_set(self) -> set: return {e.text for e in self.group} - def descend(self, raise_errors): + def descend(self, raise_errors) -> dict: return _to_dict(self.is_singleton().group[0], raise_errors) - def filter(self, cond, warn=None): + def filter(self, cond, warn=None) -> TagGroup: new_group = [] for e in self.group: if cond(e): @@ -82,7 +84,7 @@ class TagGroup: warnings.warn('Filtered {} element ({})'.format(self.tag, warn)) return TagGroup(self.tag, new_group) - def force_singleton(self, warn=True): + def force_singleton(self, warn=True) -> TagGroup: if len(self.group) == 1: return self else: @@ -93,7 +95,7 @@ class TagGroup: RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$' # Note: Includes non-specific century dates like '18XX' RE_GERMAN_DATE = r'^(?P
\d{2})\.(?P\d{2})\.(?P\d{4})$' - def fix_date(self): + def fix_date(self) -> TagGroup: for e in self.group: if e.attrib.get('encoding') == 'w3cdtf': @@ -103,6 +105,9 @@ class TagGroup: new_group = [] for e in self.group: + if e.text is None: + warnings.warn('Empty date') + continue if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text): new_group.append(e) elif re.match(self.RE_ISO8601_DATE, e.text): @@ -131,7 +136,7 @@ class TagGroup: return self - def fix_event_type(self): + def fix_event_type(self) -> TagGroup: # According to MODS-AP 2.3.1, every originInfo should have its eventType set. # Fix this for special cases. @@ -161,7 +166,7 @@ class TagGroup: pass return self - def fix_script_term(self): + def fix_script_term(self) -> TagGroup: for e in self.group: # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case. if e.attrib['authority'] == 'ISO15924': @@ -169,7 +174,7 @@ class TagGroup: warnings.warn('Changed scriptTerm authority to lower case') return self - def merge_sub_tags_to_set(self): + def merge_sub_tags_to_set(self) -> dict: from .mods4pandas import mods_to_dict value = {} @@ -189,7 +194,7 @@ class TagGroup: value[sub_tag] = s return value - def attributes(self): + def attributes(self) -> dict[str, str]: """ Return a merged dict of all attributes of the tag group. @@ -204,7 +209,7 @@ class TagGroup: attrib[a_localname] = v return attrib - def subelement_counts(self): + def subelement_counts(self) -> dict[str, int]: counts = {} for e in self.group: for x in e.iter(): @@ -213,7 +218,7 @@ class TagGroup: counts[key] = counts.get(key, 0) + 1 return counts - def xpath_statistics(self, xpath_expr, namespaces): + def xpath_statistics(self, xpath_expr, namespaces) -> dict[str, float]: """ Extract values and calculate statistics @@ -235,7 +240,7 @@ class TagGroup: statistics[f'{xpath_expr}-max'] = np.max(values) return statistics - def xpath_count(self, xpath_expr, namespaces): + def xpath_count(self, xpath_expr, namespaces) -> dict[str, int]: """ Count all elements matching xpath_expr """ @@ -279,7 +284,7 @@ def _to_dict(root, raise_errors): raise ValueError(f"Unknown namespace {root_name.namespace}") -def flatten(d: MutableMapping, parent='', separator='_'): +def flatten(d: MutableMapping, parent='', separator='_') -> dict: """ Flatten the given nested dict. @@ -301,13 +306,13 @@ def flatten(d: MutableMapping, parent='', separator='_'): return dict(items) -def valid_column_key(k): - if re.match("^[a-zA-Z0-9 _@/:\[\]-]+$", k): +def valid_column_key(k) -> bool: + if re.match(r'^[a-zA-Z0-9 _@/:\[\]-]+$', k): return True else: return False -def column_names_csv(columns): +def column_names_csv(columns) -> str: """ Format Column names (identifiers) as a comma-separated list. diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 2da7c80..ea6a49f 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -376,7 +376,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file', default='mods_info_df.parquet', show_default=True) @click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file') -def process(mets_files: List[str], output_file: str, output_page_info: str): +def process(mets_files: list[str], output_file: str, output_page_info: str): """ A tool to convert the MODS metadata in INPUT to a pandas DataFrame. @@ -389,7 +389,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): """ # Extend file list if directories are given - mets_files_real = [] + mets_files_real: list[str] = [] for m in mets_files: if os.path.isdir(m): logger.info('Scanning directory {}'.format(m)) From 62b93c760ba6d98ae7f00e3eba40cec9ebaef4b6 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 14:56:26 +0200 Subject: [PATCH 29/37] =?UTF-8?q?=F0=9F=A4=93=20Add=20type=20annotations?= =?UTF-8?q?=20(and=20related=20changes)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/lib.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index cff8ea9..44f1400 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -114,9 +114,8 @@ class TagGroup: warnings.warn('Added iso8601 encoding to date {}'.format(e.text)) e.attrib['encoding'] = 'iso8601' new_group.append(e) - elif re.match(self.RE_GERMAN_DATE, e.text): + elif m := re.match(self.RE_GERMAN_DATE, e.text): warnings.warn('Converted date {} to iso8601 encoding'.format(e.text)) - m = re.match(self.RE_GERMAN_DATE, e.text) e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd')) e.attrib['encoding'] = 'iso8601' new_group.append(e) @@ -210,7 +209,7 @@ class TagGroup: return attrib def subelement_counts(self) -> dict[str, int]: - counts = {} + counts: dict[str, int] = {} for e in self.group: for x in e.iter(): tag = ET.QName(x.tag).localname From e4db150cbaf736aa43b2cbd2c08269e4c72bc9b3 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 15:49:09 +0200 Subject: [PATCH 30/37] =?UTF-8?q?=E2=9A=99=20=20vscode:=20Enable=20pytest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/settings.json | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index de288e1..74a2cbb 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,8 @@ { - "python.formatting.provider": "black" + "python.formatting.provider": "black", + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true } \ No newline at end of file From 5384e18ab5d24e7db6d9031f5a8e241c8d506cc7 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 16:18:33 +0200 Subject: [PATCH 31/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20Check=20?= =?UTF-8?q?alto=5Finfo=20types?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/check_dtypes.py b/check_dtypes.py index cbdfd70..cf3da08 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -3,9 +3,13 @@ import re mods_info = pd.read_parquet("mods_info_df.parquet") +alto_info = pd.read_parquet("alto_info_df.parquet") # Check EXPECTED_TYPES = { + + # mods_info + r"mets_file": ("object", ["str"]), r"titleInfo_title": ("object", ["str"]), r"titleInfo_subTitle": ("object", ["str", "NoneType"]), @@ -25,6 +29,19 @@ EXPECTED_TYPES = { r"subject-.*": ("object", ["str", "NoneType"]), r"language_.*Term": ("object", ["str", "NoneType"]), r"classification-.*": ("object", ["str", "NoneType"]), + + # alto_info + + r"Description_.*": ("object", ["str", "NoneType"]), + r"Layout_Page_ID": ("object", ["str", "NoneType"]), + r"Layout_Page_PHYSICAL_(IMG|IMAGE)_NR": ("object", ["str", "NoneType"]), + r"Layout_Page_PROCESSING": ("object", ["str", "NoneType"]), + r"Layout_Page_QUALITY": ("object", ["str", "NoneType"]), + r"Layout_Page_//alto:String/@WC-.*": ("Float64", None), + r"alto_xmlns": ("object", ["str", "NoneType"]), + + # XXX r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), + r"Layout_Page_(WIDTH|HEIGHT)": ("object", ["str", "NoneType"]), } def expected_types(c): for r, types in EXPECTED_TYPES.items(): @@ -36,17 +53,21 @@ def expected_types(c): return edt, einner_types return None, None -for c in mods_info.columns: - dt = mods_info.dtypes[c] - edt, einner_types = expected_types(c) +def check_types(df): + for c in df.columns: + dt = df.dtypes[c] + edt, einner_types = expected_types(c) - if edt is None: - print(f"No expected dtype known for column {c}") - elif dt != edt: - print(f"Unexpected dtype {dt} for column {c} (expected {edt})") + if edt is None: + print(f"No expected dtype known for column {c}") + elif dt != edt: + print(f"Unexpected dtype {dt} for column {c} (expected {edt})") - if edt == "object": - inner_types = set(type(v).__name__ for v in mods_info[c]) - if any(it not in einner_types for it in inner_types): - print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})") + if edt == "object": + inner_types = set(type(v).__name__ for v in df[c]) + if any(it not in einner_types for it in inner_types): + print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})") + +check_types(mods_info) +check_types(alto_info) From a20c979351722f1a813a2c084b2d06aadced2e3f Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 17:20:28 +0200 Subject: [PATCH 32/37] =?UTF-8?q?=F0=9F=A7=B9=20Filter=20annoying=20UserWa?= =?UTF-8?q?rning=20on=20every=20pandas=20import=20(on=20WSL)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 9 ++++++++- src/mods4pandas/alto4pandas.py | 7 ++++++- src/mods4pandas/lib.py | 7 ++++++- src/mods4pandas/mods4pandas.py | 6 +++++- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/check_dtypes.py b/check_dtypes.py index cf3da08..502e4bb 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -1,5 +1,12 @@ -import pandas as pd import re +import warnings +import os + +with warnings.catch_warnings(): + # Filter warnings on WSL + if "Microsoft" in os.uname().release: + warnings.simplefilter("ignore") + import pandas as pd mods_info = pd.read_parquet("mods_info_df.parquet") diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 0739f35..668d7f3 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -15,12 +15,17 @@ from typing import List from collections.abc import MutableMapping, Sequence import click -import pandas as pd import numpy as np from tqdm import tqdm from .lib import TagGroup, convert_db_to_parquet, sorted_groupby, flatten, ns, insert_into_db +with warnings.catch_warnings(): + # Filter warnings on WSL + if "Microsoft" in os.uname().release: + warnings.simplefilter("ignore") + import pandas as pd + logger = logging.getLogger('alto4pandas') diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 44f1400..ab01fce 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -3,13 +3,18 @@ from __future__ import annotations from itertools import groupby import re import warnings +import os from typing import List, Sequence, MutableMapping, Dict from collections import defaultdict -import pandas as pd import numpy as np from lxml import etree as ET +with warnings.catch_warnings(): + # Filter warnings on WSL + if "Microsoft" in os.uname().release: + warnings.simplefilter("ignore") + import pandas as pd __all__ = ["ns"] diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index ea6a49f..2d80c33 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -15,11 +15,15 @@ from collections import defaultdict from collections.abc import MutableMapping, Sequence import click -import pandas as pd from tqdm import tqdm from .lib import convert_db_to_parquet, sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple, current_columns_types +with warnings.catch_warnings(): + # Filter warnings on WSL + if "Microsoft" in os.uname().release: + warnings.simplefilter("ignore") + import pandas as pd logger = logging.getLogger('mods4pandas') From 64ed7298da3097257ef1f0d9b9d6ef328ea00741 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 19:13:38 +0200 Subject: [PATCH 33/37] =?UTF-8?q?=E2=9C=A8=20Make=20Layout=5FPage=5FWIDTH/?= =?UTF-8?q?HEIGHT=20integer=20values?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 3 +-- src/mods4pandas/alto4pandas.py | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/check_dtypes.py b/check_dtypes.py index 502e4bb..946c5fe 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -47,8 +47,7 @@ EXPECTED_TYPES = { r"Layout_Page_//alto:String/@WC-.*": ("Float64", None), r"alto_xmlns": ("object", ["str", "NoneType"]), - # XXX r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), - r"Layout_Page_(WIDTH|HEIGHT)": ("object", ["str", "NoneType"]), + r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), } def expected_types(c): for r, types in EXPECTED_TYPES.items(): diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 668d7f3..1d7b748 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -89,6 +89,12 @@ def alto_to_dict(alto, raise_errors=True): elif localname == 'Page': value[localname] = {} value[localname].update(TagGroup(tag, group).is_singleton().attributes()) + for attr in ("WIDTH", "HEIGHT"): + if attr in value[localname]: + try: + value[localname][attr] = int(value[localname][attr]) + except ValueError: + del value[localname][attr] value[localname].update(TagGroup(tag, group).subelement_counts()) value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces)) From d685454c5260e51deaf96c4671ca600201485d50 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 20:41:13 +0200 Subject: [PATCH 34/37] =?UTF-8?q?=E2=9C=A8=20page=5Finfo:=20Use=20boolean?= =?UTF-8?q?=20for=20indicator=20variable,=20str=20for=20hrefs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 10 +++++++++- src/mods4pandas/lib.py | 4 +++- src/mods4pandas/mods4pandas.py | 4 +++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/check_dtypes.py b/check_dtypes.py index 946c5fe..5925b48 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -10,6 +10,7 @@ with warnings.catch_warnings(): mods_info = pd.read_parquet("mods_info_df.parquet") +page_info = pd.read_parquet("page_info_df.parquet") alto_info = pd.read_parquet("alto_info_df.parquet") # Check @@ -37,6 +38,11 @@ EXPECTED_TYPES = { r"language_.*Term": ("object", ["str", "NoneType"]), r"classification-.*": ("object", ["str", "NoneType"]), + # page_info + + r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]), + r"structMap-LOGICAL_TYPE_.*": ("boolean", None), + # alto_info r"Description_.*": ("object", ["str", "NoneType"]), @@ -49,6 +55,7 @@ EXPECTED_TYPES = { r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), } + def expected_types(c): for r, types in EXPECTED_TYPES.items(): if re.fullmatch(r, c): @@ -65,7 +72,7 @@ def check_types(df): edt, einner_types = expected_types(c) if edt is None: - print(f"No expected dtype known for column {c}") + print(f"No expected dtype known for column {c} (got {dt})") elif dt != edt: print(f"Unexpected dtype {dt} for column {c} (expected {edt})") @@ -75,5 +82,6 @@ def check_types(df): print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})") check_types(mods_info) +check_types(page_info) check_types(alto_info) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index ab01fce..a0646fb 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -378,10 +378,12 @@ def convert_db_to_parquet(con, table, index_col, output_file): df[c] = df[c].astype("Int64") elif column_type == "float64": df[c] = df[c].astype("Float64") + elif column_type == "bool": + df[c] = df[c].map({"True": True, "False": False}).astype("boolean") elif column_type == "set": # TODO WIP continue else: - raise NotImplementedError(f"Column type {column_type} not implemented yet.") + raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.") df.to_parquet(output_file) \ No newline at end of file diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 2d80c33..7d45b47 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -327,6 +327,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: assert file_ is not None fileGrp_USE = file_.getparent().attrib.get("USE") file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] + if file_FLocat_href is not None: + file_FLocat_href = str(file_FLocat_href) page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href def get_struct_log(*, to_phys): @@ -368,7 +370,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: for struct_div in struct_divs: type_ = struct_div.attrib.get("TYPE").lower() assert type_ - page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = 1 + page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = True result.append(page_dict) From ebdded90d6162aa8de6e240059670899887f535d Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 12 Jun 2025 07:02:23 +0200 Subject: [PATCH 35/37] =?UTF-8?q?=F0=9F=A4=93=20Add=20type=20annotations?= =?UTF-8?q?=20(and=20related=20changes)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/lib.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index a0646fb..4d85a9e 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -4,7 +4,7 @@ from itertools import groupby import re import warnings import os -from typing import List, Sequence, MutableMapping, Dict +from typing import Any, List, Sequence, MutableMapping, Dict from collections import defaultdict import numpy as np @@ -229,12 +229,14 @@ class TagGroup: Extract values using the given XPath expression, convert them to float and return descriptive statistics on the values. """ - values = [] - for e in self.group: - r = e.xpath(xpath_expr, namespaces=namespaces) - values += r - values = np.array([float(v) for v in values]) + def xpath_values(): + values = [] + for e in self.group: + r = e.xpath(xpath_expr, namespaces=namespaces) + values += r + return np.array([float(v) for v in values]) + values = xpath_values() statistics = {} if values.size > 0: statistics[f'{xpath_expr}-mean'] = np.mean(values) @@ -294,7 +296,7 @@ def flatten(d: MutableMapping, parent='', separator='_') -> dict: It is assumed that d maps strings to either another dictionary (similarly structured) or some other value. """ - items = [] + items: list[Any] = [] for k, v in d.items(): if parent: @@ -324,8 +326,8 @@ def column_names_csv(columns) -> str: """ return ",".join('"' + c + '"' for c in columns) -current_columns: defaultdict = defaultdict(list) -current_columns_types: dict[dict] = defaultdict(dict) +current_columns: dict[str, list] = defaultdict(list) +current_columns_types: dict[str, dict] = defaultdict(dict) def insert_into_db(con, table, d: Dict): """Insert the values from the dict into the table, creating columns if necessary""" From 215bfbb11fe816595fd3f8d637ba7458ae20e01e Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 12 Jun 2025 07:45:22 +0200 Subject: [PATCH 36/37] =?UTF-8?q?=E2=9C=A8=20Represent=20sets=20as=20array?= =?UTF-8?q?s=20in=20the=20Parquet=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 10 +++++----- src/mods4pandas/lib.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/check_dtypes.py b/check_dtypes.py index 5925b48..b5736df 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -24,6 +24,7 @@ EXPECTED_TYPES = { r"titleInfo_partName": ("object", ["str", "NoneType"]), r"identifier-.*": ("object", ["str", "NoneType"]), r"location_.*": ("object", ["str", "NoneType"]), + r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]), r"name\d+_.*": ("object", ["str", "NoneType"]), r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), r"typeOfResource": ("object", ["str", "NoneType"]), @@ -32,11 +33,10 @@ EXPECTED_TYPES = { r".*-count": ("Int64", None), - # XXX possibly sets: - r"genre-.*": ("object", ["str", "NoneType"]), - r"subject-.*": ("object", ["str", "NoneType"]), - r"language_.*Term": ("object", ["str", "NoneType"]), - r"classification-.*": ("object", ["str", "NoneType"]), + r"genre-.*": ("object", ["ndarray", "NoneType"]), + r"subject-.*": ("object", ["ndarray", "NoneType"]), + r"language_.*Term": ("object", ["ndarray", "NoneType"]), + r"classification-.*": ("object", ["ndarray", "NoneType"]), # page_info diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 4d85a9e..68050b1 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast from itertools import groupby import re import warnings @@ -383,8 +384,7 @@ def convert_db_to_parquet(con, table, index_col, output_file): elif column_type == "bool": df[c] = df[c].map({"True": True, "False": False}).astype("boolean") elif column_type == "set": - # TODO WIP - continue + df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None) else: raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.") From ac8740c33fba027199699837c14b14a2f5639491 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 12 Jun 2025 09:42:29 +0200 Subject: [PATCH 37/37] =?UTF-8?q?=E2=9C=94=20=20Test=20if=20dtypes=20are?= =?UTF-8?q?=20as=20expected=20in=20produced=20Parquet=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 87 ----------------------- src/mods4pandas/alto4pandas.py | 5 +- src/mods4pandas/mods4pandas.py | 6 +- src/mods4pandas/tests/test_alto.py | 53 +++++++++++++- src/mods4pandas/tests/test_mods4pandas.py | 71 +++++++++++++++++- 5 files changed, 130 insertions(+), 92 deletions(-) delete mode 100644 check_dtypes.py diff --git a/check_dtypes.py b/check_dtypes.py deleted file mode 100644 index b5736df..0000000 --- a/check_dtypes.py +++ /dev/null @@ -1,87 +0,0 @@ -import re -import warnings -import os - -with warnings.catch_warnings(): - # Filter warnings on WSL - if "Microsoft" in os.uname().release: - warnings.simplefilter("ignore") - import pandas as pd - - -mods_info = pd.read_parquet("mods_info_df.parquet") -page_info = pd.read_parquet("page_info_df.parquet") -alto_info = pd.read_parquet("alto_info_df.parquet") - -# Check -EXPECTED_TYPES = { - - # mods_info - - r"mets_file": ("object", ["str"]), - r"titleInfo_title": ("object", ["str"]), - r"titleInfo_subTitle": ("object", ["str", "NoneType"]), - r"titleInfo_partName": ("object", ["str", "NoneType"]), - r"identifier-.*": ("object", ["str", "NoneType"]), - r"location_.*": ("object", ["str", "NoneType"]), - r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]), - r"name\d+_.*": ("object", ["str", "NoneType"]), - r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), - r"typeOfResource": ("object", ["str", "NoneType"]), - r"accessCondition-.*": ("object", ["str", "NoneType"]), - r"originInfo-.*": ("object", ["str", "NoneType"]), - - r".*-count": ("Int64", None), - - r"genre-.*": ("object", ["ndarray", "NoneType"]), - r"subject-.*": ("object", ["ndarray", "NoneType"]), - r"language_.*Term": ("object", ["ndarray", "NoneType"]), - r"classification-.*": ("object", ["ndarray", "NoneType"]), - - # page_info - - r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]), - r"structMap-LOGICAL_TYPE_.*": ("boolean", None), - - # alto_info - - r"Description_.*": ("object", ["str", "NoneType"]), - r"Layout_Page_ID": ("object", ["str", "NoneType"]), - r"Layout_Page_PHYSICAL_(IMG|IMAGE)_NR": ("object", ["str", "NoneType"]), - r"Layout_Page_PROCESSING": ("object", ["str", "NoneType"]), - r"Layout_Page_QUALITY": ("object", ["str", "NoneType"]), - r"Layout_Page_//alto:String/@WC-.*": ("Float64", None), - r"alto_xmlns": ("object", ["str", "NoneType"]), - - r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), -} - -def expected_types(c): - for r, types in EXPECTED_TYPES.items(): - if re.fullmatch(r, c): - edt = types[0] - einner_types = types[1] - if einner_types: - einner_types = set(einner_types) - return edt, einner_types - return None, None - -def check_types(df): - for c in df.columns: - dt = df.dtypes[c] - edt, einner_types = expected_types(c) - - if edt is None: - print(f"No expected dtype known for column {c} (got {dt})") - elif dt != edt: - print(f"Unexpected dtype {dt} for column {c} (expected {edt})") - - if edt == "object": - inner_types = set(type(v).__name__ for v in df[c]) - if any(it not in einner_types for it in inner_types): - print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})") - -check_types(mods_info) -check_types(page_info) -check_types(alto_info) - diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 1d7b748..359a26e 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -138,7 +138,7 @@ def walk(m): @click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1) @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file', default='alto_info_df.parquet', show_default=True) -def process(alto_files: List[str], output_file: str): +def process_command(alto_files: List[str], output_file: str): """ A tool to convert the ALTO metadata in INPUT to a pandas DataFrame. @@ -151,6 +151,9 @@ def process(alto_files: List[str], output_file: str): - and a CSV file with all conversion warnings. """ + process(alto_files, output_file) + +def process(alto_files: List[str], output_file: str): # Extend file list if directories are given alto_files_real = [] for m in alto_files: diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 7d45b47..669c1e0 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -382,7 +382,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file', default='mods_info_df.parquet', show_default=True) @click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file') -def process(mets_files: list[str], output_file: str, output_page_info: str): +def process_command(mets_files: list[str], output_file: str, output_page_info: str): """ A tool to convert the MODS metadata in INPUT to a pandas DataFrame. @@ -393,7 +393,9 @@ def process(mets_files: list[str], output_file: str, output_page_info: str): Per-page information (e.g. structure information) can be output to a separate Parquet file. """ + process(mets_files, output_file, output_page_info) +def process(mets_files: list[str], output_file: str, output_page_info: str): # Extend file list if directories are given mets_files_real: list[str] = [] for m in mets_files: @@ -476,7 +478,7 @@ def main(): for prefix, uri in ns.items(): ET.register_namespace(prefix, uri) - process() + process_command() if __name__ == '__main__': diff --git a/src/mods4pandas/tests/test_alto.py b/src/mods4pandas/tests/test_alto.py index 827bc7a..adf931f 100644 --- a/src/mods4pandas/tests/test_alto.py +++ b/src/mods4pandas/tests/test_alto.py @@ -1,9 +1,13 @@ +from pathlib import Path +import re from lxml import etree as ET +import pandas as pd -from mods4pandas.alto4pandas import alto_to_dict +from mods4pandas.alto4pandas import alto_to_dict, process from mods4pandas.lib import flatten +TESTS_DATA_DIR = Path(__file__).parent / "data" def dict_fromstring(x): return flatten(alto_to_dict(ET.fromstring(x))) @@ -79,3 +83,50 @@ def test_String_TAGREF_counts(): """) assert d['Layout_Page_//alto:String[@TAGREFS]-count'] == 3 assert d['Layout_Page_String-count'] == 4 + + +def test_dtypes(tmp_path): + alto_dir = (TESTS_DATA_DIR / "alto").absolute().as_posix() + alto_info_df_parquet = (tmp_path / "test_dtypes_alto_info.parquet").as_posix() + process([alto_dir], alto_info_df_parquet) + alto_info_df = pd.read_parquet(alto_info_df_parquet) + + EXPECTED_TYPES = { + r"Description_.*": ("object", ["str", "NoneType"]), + r"Layout_Page_ID": ("object", ["str", "NoneType"]), + r"Layout_Page_PHYSICAL_(IMG|IMAGE)_NR": ("object", ["str", "NoneType"]), + r"Layout_Page_PROCESSING": ("object", ["str", "NoneType"]), + r"Layout_Page_QUALITY": ("object", ["str", "NoneType"]), + r"Layout_Page_//alto:String/@WC-.*": ("Float64", None), + r".*-count": ("Int64", None), + r"alto_xmlns": ("object", ["str", "NoneType"]), + + r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), + } + def expected_types(c): + """Return the expected types for column c.""" + for r, types in EXPECTED_TYPES.items(): + if re.fullmatch(r, c): + edt = types[0] + einner_types = types[1] + if einner_types: + einner_types = set(einner_types) + return edt, einner_types + return None, None + + def check_types(df): + """Check the types of the DataFrame df.""" + for c in df.columns: + dt = df.dtypes[c] + edt, einner_types = expected_types(c) + print(c, dt, edt) + + assert edt is not None, f"No expected dtype known for column {c} (got {dt})" + assert dt == edt, f"Unexpected dtype {dt} for column {c} (expected {edt})" + + if edt == "object": + inner_types = set(type(v).__name__ for v in df[c]) + assert all(it in einner_types for it in inner_types), \ + f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})" + + check_types(alto_info_df) \ No newline at end of file diff --git a/src/mods4pandas/tests/test_mods4pandas.py b/src/mods4pandas/tests/test_mods4pandas.py index f9a98d7..0707a74 100644 --- a/src/mods4pandas/tests/test_mods4pandas.py +++ b/src/mods4pandas/tests/test_mods4pandas.py @@ -1,10 +1,14 @@ +from pathlib import Path +import re from lxml import etree as ET +import pandas as pd import pytest -from mods4pandas.mods4pandas import mods_to_dict +from mods4pandas.mods4pandas import mods_to_dict, process from mods4pandas.lib import flatten +TESTS_DATA_DIR = Path(__file__).parent / "data" def dict_fromstring(x): """Helper function to parse a MODS XML string to a flattened dict""" @@ -151,3 +155,68 @@ def test_relatedItem(): """) assert d['relatedItem-original_recordInfo_recordIdentifier-dnb-ppn'] == '1236513355' + +def test_dtypes(tmp_path): + mets_files = [p.absolute().as_posix() for p in (TESTS_DATA_DIR / "mets-mods").glob("*.xml")] + mods_info_df_parquet = (tmp_path / "test_dtypes_mods_info.parquet").as_posix() + page_info_df_parquet = (tmp_path / "test_dtypes_page_info.parquet").as_posix() + process(mets_files, mods_info_df_parquet, page_info_df_parquet) + mods_info_df = pd.read_parquet(mods_info_df_parquet) + page_info_df = pd.read_parquet(page_info_df_parquet) + + EXPECTED_TYPES = { + # mods_info + + r"mets_file": ("object", ["str"]), + r"titleInfo_title": ("object", ["str"]), + r"titleInfo_subTitle": ("object", ["str", "NoneType"]), + r"titleInfo_partName": ("object", ["str", "NoneType"]), + r"identifier-.*": ("object", ["str", "NoneType"]), + r"location_.*": ("object", ["str", "NoneType"]), + r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]), + r"name\d+_.*": ("object", ["str", "NoneType"]), + r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), + r"typeOfResource": ("object", ["str", "NoneType"]), + r"accessCondition-.*": ("object", ["str", "NoneType"]), + r"originInfo-.*": ("object", ["str", "NoneType"]), + + r".*-count": ("Int64", None), + + r"genre-.*": ("object", ["ndarray", "NoneType"]), + r"subject-.*": ("object", ["ndarray", "NoneType"]), + r"language_.*Term": ("object", ["ndarray", "NoneType"]), + r"classification-.*": ("object", ["ndarray", "NoneType"]), + + # page_info + + r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]), + r"structMap-LOGICAL_TYPE_.*": ("boolean", None), + } + def expected_types(c): + """Return the expected types for column c.""" + for r, types in EXPECTED_TYPES.items(): + if re.fullmatch(r, c): + edt = types[0] + einner_types = types[1] + if einner_types: + einner_types = set(einner_types) + return edt, einner_types + return None, None + + def check_types(df): + """Check the types of the DataFrame df.""" + for c in df.columns: + dt = df.dtypes[c] + edt, einner_types = expected_types(c) + print(c, dt, edt) + + assert edt is not None, f"No expected dtype known for column {c} (got {dt})" + assert dt == edt, f"Unexpected dtype {dt} for column {c} (expected {edt})" + + if edt == "object": + inner_types = set(type(v).__name__ for v in df[c]) + assert all(it in einner_types for it in inner_types), \ + f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})" + + check_types(mods_info_df) + check_types(page_info_df) \ No newline at end of file