mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-25 19:49:54 +02:00
✨ Represent sets as arrays in the Parquet file
This commit is contained in:
parent
ebdded90d6
commit
215bfbb11f
2 changed files with 7 additions and 7 deletions
|
@ -24,6 +24,7 @@ EXPECTED_TYPES = {
|
|||
r"titleInfo_partName": ("object", ["str", "NoneType"]),
|
||||
r"identifier-.*": ("object", ["str", "NoneType"]),
|
||||
r"location_.*": ("object", ["str", "NoneType"]),
|
||||
r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]),
|
||||
r"name\d+_.*": ("object", ["str", "NoneType"]),
|
||||
r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
|
||||
r"typeOfResource": ("object", ["str", "NoneType"]),
|
||||
|
@ -32,11 +33,10 @@ EXPECTED_TYPES = {
|
|||
|
||||
r".*-count": ("Int64", None),
|
||||
|
||||
# XXX possibly sets:
|
||||
r"genre-.*": ("object", ["str", "NoneType"]),
|
||||
r"subject-.*": ("object", ["str", "NoneType"]),
|
||||
r"language_.*Term": ("object", ["str", "NoneType"]),
|
||||
r"classification-.*": ("object", ["str", "NoneType"]),
|
||||
r"genre-.*": ("object", ["ndarray", "NoneType"]),
|
||||
r"subject-.*": ("object", ["ndarray", "NoneType"]),
|
||||
r"language_.*Term": ("object", ["ndarray", "NoneType"]),
|
||||
r"classification-.*": ("object", ["ndarray", "NoneType"]),
|
||||
|
||||
# page_info
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
from itertools import groupby
|
||||
import re
|
||||
import warnings
|
||||
|
@ -383,8 +384,7 @@ def convert_db_to_parquet(con, table, index_col, output_file):
|
|||
elif column_type == "bool":
|
||||
df[c] = df[c].map({"True": True, "False": False}).astype("boolean")
|
||||
elif column_type == "set":
|
||||
# TODO WIP
|
||||
continue
|
||||
df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None)
|
||||
else:
|
||||
raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue