1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-06-25 19:49:54 +02:00

Represent sets as arrays in the Parquet file

This commit is contained in:
Mike Gerber 2025-06-12 07:45:22 +02:00
parent ebdded90d6
commit 215bfbb11f
2 changed files with 7 additions and 7 deletions

View file

@@ -24,6 +24,7 @@ EXPECTED_TYPES = {
  r"titleInfo_partName": ("object", ["str", "NoneType"]),
  r"identifier-.*": ("object", ["str", "NoneType"]),
  r"location_.*": ("object", ["str", "NoneType"]),
+ r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]),
  r"name\d+_.*": ("object", ["str", "NoneType"]),
  r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
  r"typeOfResource": ("object", ["str", "NoneType"]),
@@ -32,11 +33,10 @@ EXPECTED_TYPES = {
  r".*-count": ("Int64", None),
- # XXX possibly sets:
- r"genre-.*": ("object", ["str", "NoneType"]),
- r"subject-.*": ("object", ["str", "NoneType"]),
- r"language_.*Term": ("object", ["str", "NoneType"]),
- r"classification-.*": ("object", ["str", "NoneType"]),
+ r"genre-.*": ("object", ["ndarray", "NoneType"]),
+ r"subject-.*": ("object", ["ndarray", "NoneType"]),
+ r"language_.*Term": ("object", ["ndarray", "NoneType"]),
+ r"classification-.*": ("object", ["ndarray", "NoneType"]),
  # page_info

View file

@@ -1,5 +1,6 @@
  from __future__ import annotations
+ import ast
  from itertools import groupby
  import re
  import warnings
@@ -383,8 +384,7 @@ def convert_db_to_parquet(con, table, index_col, output_file):
  elif column_type == "bool":
  df[c] = df[c].map({"True": True, "False": False}).astype("boolean")
  elif column_type == "set":
- # TODO WIP
- continue
+ df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None)
  else:
  raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")