From fef45e42d48764e600d1648b4db48a58e342f33d Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 6 Aug 2025 19:58:48 +0200 Subject: [PATCH 1/8] =?UTF-8?q?=E2=9A=99=20=20pre-commit:=20update=20confi?= =?UTF-8?q?g?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 085ca09..26ea3e1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.13 + rev: v0.12.7 hooks: - args: - --fix @@ -24,7 +24,7 @@ repos: id: ruff-check - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update - rev: v0.7.0 + rev: v0.8.0 hooks: - id: pre-commit-update From f59bcfbd63b6ebe194f31b084bd6f867d6360b8b Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 6 Aug 2025 20:00:19 +0200 Subject: [PATCH 2/8] =?UTF-8?q?=F0=9F=90=9B=20Fix=20alto4pandas=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/alto4pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 2e9f6cb..500ac31 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -252,7 +252,7 @@ def main(): for prefix, uri in ns.items(): ET.register_namespace(prefix, uri) - process() + process_command() if __name__ == "__main__": From 4204e811f7c591f8184a4de71f7edd52a1ccf51f Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 6 Aug 2025 20:14:34 +0200 Subject: [PATCH 3/8] =?UTF-8?q?=E2=9A=99=20=20editorconfig:=20Update=20for?= =?UTF-8?q?=20VS=20Code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .editorconfig | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.editorconfig b/.editorconfig index fac7a92..a63f7f6 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,2 +1,29 @@ +root = true + [*] +charset = utf-8 +end_of_line = lf +indent_size = 4 +indent_style = space +insert_final_newline = true +trim_trailing_whitespace = true max_line_length = 120 +tab_width = 4 + +[{*.cfg, *.ini, *.html, *.yaml, *.yml}] +indent_size = 2 + +[*.json] +indent_size = 2 +insert_final_newline = true + +# trailing spaces in markdown indicate word wrap +[*.md] +trim_trailing_whitespace = false + +[*.py] +multi_line_output = 3 +include_trailing_comma = True +force_grid_wrap = 0 +use_parentheses = True +ensure_newline_before_comments = True From a9d650e34563d8774808a11c90feda7a2457abed Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 6 Aug 2025 20:16:16 +0200 Subject: [PATCH 4/8] =?UTF-8?q?=E2=9C=94=20=20ALTO:=20Make=20sure=20we=20h?= =?UTF-8?q?ave=20inner=20types=20when=20testing=20SQLite=20conversion=20fo?= =?UTF-8?q?r=20'object'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/tests/test_alto.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mods4pandas/tests/test_alto.py b/src/mods4pandas/tests/test_alto.py index 849e076..a215f89 100644 --- a/src/mods4pandas/tests/test_alto.py +++ b/src/mods4pandas/tests/test_alto.py @@ -135,6 +135,7 @@ def test_dtypes(tmp_path): assert dt == edt, f"Unexpected dtype {dt} for column {c} (expected {edt})" if edt == "object": + assert einner_types is not None inner_types = set(type(v).__name__ for v in df[c]) assert all( it in einner_types for it in inner_types From eae273452cf867bdfc34592b396177bb0cb91d61 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 6 Aug 2025 20:17:10 +0200 Subject: [PATCH 5/8] =?UTF-8?q?=F0=9F=90=9B=20MODS/get=5Fmets=5Fdiv:=20Ret?= =?UTF-8?q?urn=20empty=20list=20in=20case=20an=20ID=20is=20not=20found?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index d1a3d4f..e12878d 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -432,6 +432,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: def get_mets_div(*, ID): if ID: return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns) + else: + return [] for page in div_physSequence: # TODO sort by ORDER? From 2f5c872563f75673681598e3bef813596d9db63c Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 6 Aug 2025 20:18:44 +0200 Subject: [PATCH 6/8] =?UTF-8?q?=F0=9F=90=9B=20Explicitly=20set=20con=5Fpag?= =?UTF-8?q?e=5Finfo=20to=20None=20if=20we=20don't=20output=20page=5Finfo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index e12878d..3d71268 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -560,6 +560,8 @@ def process(mets_files: list[str], output_file: str, output_page_info: str): with contextlib.suppress(FileNotFoundError): os.remove(output_page_info_sqlite3) con_page_info = sqlite3.connect(output_page_info_sqlite3) + else: + con_page_info = None # Process METS files with open(output_file + ".warnings.csv", "w") as csvfile: From a3fc34fcdcbd43bb4f36488dcbdfa03568100529 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 6 Aug 2025 20:19:34 +0200 Subject: [PATCH 7/8] =?UTF-8?q?=E2=9C=94=20=20MODS:=20Check=20for=20Warnin?= =?UTF-8?q?gs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/tests/test_mods4pandas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mods4pandas/tests/test_mods4pandas.py b/src/mods4pandas/tests/test_mods4pandas.py index d7daf84..24f051f 100644 --- a/src/mods4pandas/tests/test_mods4pandas.py +++ b/src/mods4pandas/tests/test_mods4pandas.py @@ -153,6 +153,7 @@ def test_originInfo_no_event_type(): assert d == {} # empty assert len(ws) == 1 + assert isinstance(ws[0].message, Warning) assert ( ws[0].message.args[0] == "Filtered {http://www.loc.gov/mods/v3}originInfo element (has no eventType)" From 3afcd696ee39188f399ce4a983d23674cd863edf Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 6 Aug 2025 20:23:58 +0200 Subject: [PATCH 8/8] =?UTF-8?q?=E2=9C=92=20=20README-DEV:=20Give=20example?= =?UTF-8?q?=20how=20to=20run=20the=20CLIs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-DEV.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README-DEV.md b/README-DEV.md index 0765657..b3a56c9 100644 --- a/README-DEV.md +++ b/README-DEV.md @@ -16,6 +16,14 @@ To run a test with profiling: pytest --profile-svg -k test_page_info ``` +To directly test the CLIs using our test data, run: + +``` +mods4pandas src/mods4pandas/tests/data/mets-mods +alto4pandas src/mods4pandas/tests/data/alto +``` + + # How to use pre-commit This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it: