From 215bfbb11fe816595fd3f8d637ba7458ae20e01e Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 12 Jun 2025 07:45:22 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Represent=20sets=20as=20arrays=20in?= =?UTF-8?q?=20the=20Parquet=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- check_dtypes.py | 10 +++++----- src/mods4pandas/lib.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/check_dtypes.py b/check_dtypes.py index 5925b48..b5736df 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -24,6 +24,7 @@ EXPECTED_TYPES = { r"titleInfo_partName": ("object", ["str", "NoneType"]), r"identifier-.*": ("object", ["str", "NoneType"]), r"location_.*": ("object", ["str", "NoneType"]), + r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]), r"name\d+_.*": ("object", ["str", "NoneType"]), r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]), r"typeOfResource": ("object", ["str", "NoneType"]), @@ -32,11 +33,10 @@ EXPECTED_TYPES = { r".*-count": ("Int64", None), - # XXX possibly sets: - r"genre-.*": ("object", ["str", "NoneType"]), - r"subject-.*": ("object", ["str", "NoneType"]), - r"language_.*Term": ("object", ["str", "NoneType"]), - r"classification-.*": ("object", ["str", "NoneType"]), + r"genre-.*": ("object", ["ndarray", "NoneType"]), + r"subject-.*": ("object", ["ndarray", "NoneType"]), + r"language_.*Term": ("object", ["ndarray", "NoneType"]), + r"classification-.*": ("object", ["ndarray", "NoneType"]), # page_info diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py index 4d85a9e..68050b1 100644 --- a/src/mods4pandas/lib.py +++ b/src/mods4pandas/lib.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast from itertools import groupby import re import warnings @@ -383,8 +384,7 @@ def convert_db_to_parquet(con, table, index_col, output_file): elif column_type == "bool": df[c] = df[c].map({"True": True, "False": False}).astype("boolean") elif column_type == "set": - # TODO WIP - continue + df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None) else: raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")