# Helper introduced in qurator/mods4pandas/lib.py.
#
# The same change set replaces the inline column/data/index construction in
# process() (qurator/mods4pandas/mods4pandas.py) with this call:
#
#     mods_info_df = dicts_to_df(mods_info,
#                                index_column="recordInfo_recordIdentifier")
from typing import Dict, List

import pandas as pd


def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
    """
    Convert the given list of dicts to a Pandas DataFrame.

    The keys of the dicts make the columns. Columns appear in the order in
    which their keys are first encountered while walking ``data_list``. A
    dict that lacks some column gets ``None`` in that cell (pandas may
    represent this as NaN, depending on the column's dtype).

    Args:
        data_list: The records, one dict per DataFrame row.
        index_column: Key whose value becomes the row's index label. It must
            be present in every record. The key also remains a regular
            column of the result.

    Returns:
        A DataFrame with one row per record. An empty ``data_list`` yields an
        empty DataFrame.

    Raises:
        KeyError: If a record does not contain ``index_column``. The message
            names the position of the offending record.
    """

    # Build index first so that a malformed record is reported before any
    # table data is assembled.
    index = []
    for position, record in enumerate(data_list):
        try:
            index.append(record[index_column])
        except KeyError:
            raise KeyError(
                f"record at position {position} has no "
                f"index column {index_column!r}"
            ) from None

    # Build columns from keys. A dict keeps insertion order and offers O(1)
    # membership, so this yields the first-seen key order without rescanning
    # a list for every key.
    columns = list(dict.fromkeys(key for record in data_list for key in record))

    # Build data table; keys missing from a record become None.
    data = [[record.get(column) for column in columns] for record in data_list]

    return pd.DataFrame(data=data, index=index, columns=columns)