diff --git a/qurator/mods4pandas/lib.py b/qurator/mods4pandas/lib.py
index cb44656..d2e1f8f 100644
--- a/qurator/mods4pandas/lib.py
+++ b/qurator/mods4pandas/lib.py
@@ -300,7 +300,7 @@ def flatten(d: MutableMapping, parent='', separator='_'):
     return dict(items)
 
 
-def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
+def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
     """
     Convert the given list of dicts to a Pandas DataFrame.
 
@@ -318,7 +318,18 @@ def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
     data = [[m.get(c) for c in columns] for m in data_list]
 
     # Build index
-    index = [m[index_column] for m in data_list]
+    if isinstance(index_column, str):
+        # A single column name yields a plain (one-level) index
+        index = [m[index_column] for m in data_list]
+    elif isinstance(index_column, tuple):
+        # A tuple of column names yields a MultiIndex, one level per name
+        index = [[m[c] for m in data_list] for c in index_column]
+        index = pd.MultiIndex.from_arrays(index, names=index_column)
+    else:
+        raise ValueError(
+            "index_column must be a str or a tuple of str, "
+            f"got {type(index_column).__name__}"
+        )
 
     df = pd.DataFrame(data=data, index=index, columns=columns)
     return df
diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py
index 62f453c..7229d37 100755
--- a/qurator/mods4pandas/mods4pandas.py
+++ b/qurator/mods4pandas/mods4pandas.py
@@ -415,6 +415,13 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
         logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
         mods_info_df.to_excel(output_xlsx)
 
+    # Convert page_info
+    # XXX hardcoded filenames + other formats
+    page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
+    # Pickle the DataFrame
+    logger.info('Writing DataFrame to {}'.format("page_info_df.pkl"))
+    page_info_df.to_pickle("page_info_df.pkl")
+
 
 def main():
     logging.basicConfig(level=logging.INFO)