🚧 Write out page_info

master
Mike Gerber 12 months ago
parent e51fa5750f
commit c5332ae80d

@ -300,7 +300,7 @@ def flatten(d: MutableMapping, parent='', separator='_'):
return dict(items) return dict(items)
def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame: def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
""" """
Convert the given list of dicts to a Pandas DataFrame. Convert the given list of dicts to a Pandas DataFrame.
@ -318,7 +318,13 @@ def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
data = [[m.get(c) for c in columns] for m in data_list] data = [[m.get(c) for c in columns] for m in data_list]
# Build index # Build index
index = [m[index_column] for m in data_list] if isinstance(index_column, str):
index = [m[index_column] for m in data_list]
elif isinstance(index_column, tuple):
index = [[m[c] for m in data_list] for c in index_column]
index = pd.MultiIndex.from_arrays(index, names=index_column)
else:
raise ValueError(f"index_column must")
df = pd.DataFrame(data=data, index=index, columns=columns) df = pd.DataFrame(data=data, index=index, columns=columns)
return df return df

@ -415,6 +415,13 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
logger.info('Writing Excel .xlsx to {}'.format(output_xlsx)) logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
mods_info_df.to_excel(output_xlsx) mods_info_df.to_excel(output_xlsx)
# Convert page_info
# XXX The output filename is hardcoded, and only a pickle is written; other output formats are still missing
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
# Pickle the DataFrame
logger.info('Writing DataFrame to {}'.format("page_info_df.pkl"))
page_info_df.to_pickle("page_info_df.pkl")
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)

Loading…
Cancel
Save