diff --git a/check_dtypes.py b/check_dtypes.py index 946c5fe..5925b48 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -10,6 +10,7 @@ with warnings.catch_warnings(): mods_info = pd.read_parquet("mods_info_df.parquet") +page_info = pd.read_parquet("page_info_df.parquet") alto_info = pd.read_parquet("alto_info_df.parquet") # Check @@ -37,6 +38,11 @@ EXPECTED_TYPES = { r"language_.*Term": ("object", ["str", "NoneType"]), r"classification-.*": ("object", ["str", "NoneType"]), + # page_info + + r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]), + r"structMap-LOGICAL_TYPE_.*": ("boolean", None), + # alto_info r"Description_.*": ("object", ["str", "NoneType"]), @@ -49,6 +55,7 @@ EXPECTED_TYPES = { r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), } + def expected_types(c): for r, types in EXPECTED_TYPES.items(): if re.fullmatch(r, c): @@ -65,7 +72,7 @@ def check_types(df): edt, einner_types = expected_types(c) if edt is None: - print(f"No expected dtype known for column {c}") + print(f"No expected dtype known for column {c} (got {dt})") elif dt != edt: print(f"Unexpected dtype {dt} for column {c} (expected {edt})") @@ -75,5 +82,6 @@ def check_types(df): print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})") check_types(mods_info) +check_types(page_info) check_types(alto_info) diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index ab01fce..a0646fb 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -378,10 +378,12 @@ def convert_db_to_parquet(con, table, index_col, output_file): df[c] = df[c].astype("Int64") elif column_type == "float64": df[c] = df[c].astype("Float64") + elif column_type == "bool": + df[c] = df[c].map({"True": True, "False": False}).astype("boolean") elif column_type == "set": # TODO WIP continue else: - raise NotImplementedError(f"Column type {column_type} not implemented yet.") + raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.") df.to_parquet(output_file) \ No newline at end of file diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 2d80c33..7d45b47 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -327,6 +327,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: assert file_ is not None fileGrp_USE = file_.getparent().attrib.get("USE") file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] + if file_FLocat_href is not None: + file_FLocat_href = str(file_FLocat_href) page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href def get_struct_log(*, to_phys): @@ -368,7 +370,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: for struct_div in struct_divs: type_ = struct_div.attrib.get("TYPE").lower() assert type_ - page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = 1 + page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = True result.append(page_dict)