🧹 Extract a function to convert list[dict] to a DataFrame

2026-07-21 19:19:12 +02:00 · 2023-11-23 15:00:06 +01:00 · 2023-11-23 15:00:06 +01:00 · 968572168e
commit 968572168e
parent 5c2dfa8505
2 changed files with 27 additions and 10 deletions
--- a/qurator/mods4pandas/lib.py
+++ b/qurator/mods4pandas/lib.py
@@ -1,8 +1,9 @@
 from itertools import groupby
 import re
 import warnings
-from typing import List, Sequence, MutableMapping
+from typing import List, Sequence, MutableMapping, Dict

+import pandas as pd
 import numpy as np
 from lxml import etree as ET

@@ -298,3 +299,26 @@ def flatten(d: MutableMapping, parent='', separator='_'):

    return dict(items)

+
+def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
+    """
+    Convert the given list of dicts to a Pandas DataFrame.
+
+    The keys of the dicts make the columns.
+    """
+
+    # Build columns from keys
+    columns = []
+    for m in data_list:
+        for c in m.keys():
+            if c not in columns:
+                columns.append(c)
+
+    # Build data table
+    data = [[m.get(c) for c in columns] for m in data_list]
+
+    # Build index
+    index = [m[index_column] for m in data_list]
+
+    df = pd.DataFrame(data=data, index=index, columns=columns)
+    return df
--- a/qurator/mods4pandas/mods4pandas.py
+++ b/qurator/mods4pandas/mods4pandas.py
@@ -14,7 +14,7 @@ import click
 import pandas as pd
 from tqdm import tqdm

-from .lib import sorted_groupby, TagGroup, ns, flatten
+from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df



@@ -404,14 +404,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
                #import traceback; traceback.print_exc()

    # Convert the mods_info List[Dict] to a pandas DataFrame
-    columns = []
-    for m in mods_info:
-        for c in m.keys():
-            if c not in columns:
-                columns.append(c)
-    data = [[m.get(c) for c in columns] for m in mods_info]
-    index = [m['recordInfo_recordIdentifier'] for m in mods_info]  # PPN
-    mods_info_df = pd.DataFrame(data=data, index=index, columns=columns)
+    mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")

    # Pickle the DataFrame
    logger.info('Writing DataFrame to {}'.format(output_file))