From a1f333f4a4794f9d60e66237e706b61f6962a819 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 31 Jul 2024 10:27:46 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20converting/writing=20out?= =?UTF-8?q?=20per-page=20information=20(e.g.=20structure=20information)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 11 +++++++---- src/mods4pandas/mods4pandas.py | 15 +++++++-------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 69d6b38..6d00619 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,9 @@ instead of ordered lists for topics, etc. Parts of the tool are specific to our environment/needs at the State Library Berlin and may need to be changed for your library. +Per-page information (e.g. structure information from the METS structMap) can +be converted as well (`--output-page-info`). + **alto4pandas** converts the metadata from ALTO files into a pandas DataFrame. Column names are derived from the corresponding ALTO elements. Some columns @@ -31,14 +34,14 @@ In this example we convert the MODS metadata contained in the METS files in `mods_info_df.parquet`. This file can then be read by your data scientist using `pd.read_parquet()`. -~~~ +``` % mods4pandas /srv/data/digisam_mets-sample-300 INFO:root:Scanning directory /srv/data/digisam_mets-sample-300 301it [00:00, 19579.19it/s] INFO:root:Processing METS files 100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s] -INFO:root:Writing DataFrame to mods_info_df.pkl -~~~ +INFO:root:Writing DataFrame to mods_info_df.parquet +``` In the next example we convert the metadata from the ALTO files in the test data directory: @@ -56,5 +59,5 @@ Scanning directory qurator/mods4pandas/tests/data/alto/PPN715049151 Scanning directory qurator/mods4pandas/tests/data/alto/749782137 Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns INFO:alto4pandas:Processing ALTO files -INFO:alto4pandas:Writing DataFrame to alto_info_df.pkl +INFO:alto4pandas:Writing DataFrame to alto_info_df.parquet ~~~ diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 8edb659..65d7ada 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -373,8 +373,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: default='mods_info_df.parquet', show_default=True) @click.option('--output-csv', type=click.Path(), help='Output CSV file') @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') -@click.option('--page-info', is_flag=True, show_default=True, default=False, help='Save page info') -def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, page_info: bool): +@click.option('--output-page-info', type=click.Path(), help='Save page info') +def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str): """ A tool to convert the MODS metadata in INPUT to a pandas DataFrame. @@ -420,11 +420,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls d['mets_file'] = mets_file # METS - per-page - if page_info: + if output_page_info: page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) mods_info.append(d) - if page_info: + if output_page_info: page_info.extend(page_info_doc) if caught_warnings: @@ -450,12 +450,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls mods_info_df.to_excel(output_xlsx) # Convert page_info - # XXX hardcoded filenames + other formats - if page_info: + if output_page_info: page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) # Save the DataFrame - logger.info('Writing DataFrame to {}'.format("page_info_df.parquet")) - page_info_df.to_parquet("page_info_df.parquet") + logger.info('Writing DataFrame to {}'.format(output_page_info)) + page_info_df.to_parquet(output_page_info) def main():