diff --git a/check_dtypes.py b/check_dtypes.py index 5925b48..b5736df 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -24,6 +24,7 @@ EXPECTED_TYPES = { r"titleInfo_partName": ("object", ["str", "NoneType"]), r"identifier-.*": ("object", ["str", "NoneType"]), r"location_.*": ("object", ["str", "NoneType"]), + r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]), r"name\d+_.*": ("object", ["str", "NoneType"]), r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), r"typeOfResource": ("object", ["str", "NoneType"]), @@ -32,11 +33,10 @@ EXPECTED_TYPES = { r".*-count": ("Int64", None), - # XXX possibly sets: - r"genre-.*": ("object", ["str", "NoneType"]), - r"subject-.*": ("object", ["str", "NoneType"]), - r"language_.*Term": ("object", ["str", "NoneType"]), - r"classification-.*": ("object", ["str", "NoneType"]), + r"genre-.*": ("object", ["ndarray", "NoneType"]), + r"subject-.*": ("object", ["ndarray", "NoneType"]), + r"language_.*Term": ("object", ["ndarray", "NoneType"]), + r"classification-.*": ("object", ["ndarray", "NoneType"]), # page_info diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 4d85a9e..68050b1 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast from itertools import groupby import re import warnings @@ -383,8 +384,7 @@ def convert_db_to_parquet(con, table, index_col, output_file): elif column_type == "bool": df[c] = df[c].map({"True": True, "False": False}).astype("boolean") elif column_type == "set": - # TODO WIP - continue + df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None) else: raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")