From 5384e18ab5d24e7db6d9031f5a8e241c8d506cc7 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 11 Jun 2025 16:18:33 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20Check=20alto?= =?UTF-8?q?=5Finfo=20types?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/check_dtypes.py b/check_dtypes.py index cbdfd70..cf3da08 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -3,9 +3,13 @@ import re mods_info = pd.read_parquet("mods_info_df.parquet") +alto_info = pd.read_parquet("alto_info_df.parquet") # Check EXPECTED_TYPES = { + + # mods_info + r"mets_file": ("object", ["str"]), r"titleInfo_title": ("object", ["str"]), r"titleInfo_subTitle": ("object", ["str", "NoneType"]), @@ -25,6 +29,19 @@ EXPECTED_TYPES = { r"subject-.*": ("object", ["str", "NoneType"]), r"language_.*Term": ("object", ["str", "NoneType"]), r"classification-.*": ("object", ["str", "NoneType"]), + + # alto_info + + r"Description_.*": ("object", ["str", "NoneType"]), + r"Layout_Page_ID": ("object", ["str", "NoneType"]), + r"Layout_Page_PHYSICAL_(IMG|IMAGE)_NR": ("object", ["str", "NoneType"]), + r"Layout_Page_PROCESSING": ("object", ["str", "NoneType"]), + r"Layout_Page_QUALITY": ("object", ["str", "NoneType"]), + r"Layout_Page_//alto:String/@WC-.*": ("Float64", None), + r"alto_xmlns": ("object", ["str", "NoneType"]), + + # XXX r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), + r"Layout_Page_(WIDTH|HEIGHT)": ("object", ["str", "NoneType"]), } def expected_types(c): for r, types in EXPECTED_TYPES.items(): @@ -36,17 +53,21 @@ def expected_types(c): return edt, einner_types return None, None -for c in mods_info.columns: - dt = mods_info.dtypes[c] - edt, einner_types = expected_types(c) +def check_types(df): + for c in df.columns: + dt = df.dtypes[c] + edt, einner_types = expected_types(c) - if edt is None: - print(f"No expected dtype known for column {c}") - elif dt != edt: - print(f"Unexpected dtype {dt} for column {c} (expected {edt})") + if edt is None: + print(f"No expected dtype known for column {c}") + elif dt != edt: + print(f"Unexpected dtype {dt} for column {c} (expected {edt})") - if edt == "object": - inner_types = set(type(v).__name__ for v in mods_info[c]) - if any(it not in einner_types for it in inner_types): - print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})") + if edt == "object": + inner_types = set(type(v).__name__ for v in df[c]) + if any(it not in einner_types for it in inner_types): + print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})") + +check_types(mods_info) +check_types(alto_info)