🐛 Fix converting/writing out per-page information (e.g. structure information)

2025-12-11 21:04:16 +01:00 · 2024-07-31 10:27:46 +02:00 · 2024-07-31 10:27:46 +02:00 · a1f333f4a4
commit a1f333f4a4
parent 1bf86bfb4c
2 changed files with 14 additions and 12 deletions
--- a/README.md
+++ b/README.md
@@ -10,6 +10,9 @@ instead of ordered lists for topics, etc. Parts of the tool are specific to
 our environment/needs at the State Library Berlin and may need to be changed for
 your library.

+Per-page information (e.g. structure information from the METS structMap) can
+also be converted and written to its own Parquet file (`--output-page-info`).
+
 **alto4pandas** converts the metadata from ALTO files into a pandas DataFrame.

 Column names are derived from the corresponding ALTO elements. Some columns
@@ -31,14 +34,14 @@ In this example we convert the MODS metadata contained in the METS files in
 `mods_info_df.parquet`. This file can then be read by your data scientist using
 `pd.read_parquet()`.

-~~~
+```
 % mods4pandas /srv/data/digisam_mets-sample-300
 INFO:root:Scanning directory /srv/data/digisam_mets-sample-300
 301it [00:00, 19579.19it/s]
 INFO:root:Processing METS files
 100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s]
-INFO:root:Writing DataFrame to mods_info_df.pkl
-~~~
+INFO:root:Writing DataFrame to mods_info_df.parquet
+```

 In the next example we convert the metadata from the ALTO files in the test data
 directory:
@@ -56,5 +59,5 @@ Scanning directory qurator/mods4pandas/tests/data/alto/PPN715049151
 Scanning directory qurator/mods4pandas/tests/data/alto/749782137
 Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns
 INFO:alto4pandas:Processing ALTO files
-INFO:alto4pandas:Writing DataFrame to alto_info_df.pkl
+INFO:alto4pandas:Writing DataFrame to alto_info_df.parquet
 ~~~
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -373,8 +373,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
              default='mods_info_df.parquet', show_default=True)
 @click.option('--output-csv', type=click.Path(), help='Output CSV file')
 @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
-@click.option('--page-info', is_flag=True, show_default=True, default=False, help='Save page info')
-def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, page_info: bool):
+@click.option('--output-page-info', type=click.Path(), help='Save page info')
+def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str):
    """
    A tool to convert the MODS metadata in INPUT to a pandas DataFrame.

@@ -420,11 +420,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
                    d['mets_file'] = mets_file

                    # METS - per-page
-                    if page_info:
+                    if output_page_info:
                        page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)

                    mods_info.append(d)
-                    if page_info:
+                    if output_page_info:
                        page_info.extend(page_info_doc)

                    if caught_warnings:
@@ -450,12 +450,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
        mods_info_df.to_excel(output_xlsx)

    # Convert page_info
-    # XXX hardcoded filenames + other formats
-    if page_info:
+    if output_page_info:
        page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
        # Save the DataFrame
-        logger.info('Writing DataFrame to {}'.format("page_info_df.parquet"))
-        page_info_df.to_parquet("page_info_df.parquet")
+        logger.info('Writing DataFrame to {}'.format(output_page_info))
+        page_info_df.to_parquet(output_page_info)


 def main():