From a1f333f4a4794f9d60e66237e706b61f6962a819 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 31 Jul 2024 10:27:46 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20converting/writing=20out?=
 =?UTF-8?q?=20per-page=20information=20(e.g.=20structure=20information)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                      | 11 +++++++----
 src/mods4pandas/mods4pandas.py | 15 +++++++--------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 69d6b38..6d00619 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,9 @@ instead of ordered lists for topics, etc. Parts of the tool are specific to
 our environment/needs at the State Library Berlin and may need to be changed for
 your library.
 
+Per-page information (e.g. structure information from the METS structMap) can
+also be converted and saved as Parquet (`--output-page-info <file>`).
+
 **alto4pandas** converts the metadata from ALTO files into a pandas DataFrame.
 
 Column names are derived from the corresponding ALTO elements. Some columns
@@ -31,14 +34,14 @@ In this example we convert the MODS metadata contained in the METS files in
 `mods_info_df.parquet`. This file can then be read by your data scientist using
 `pd.read_parquet()`.
 
-~~~
+```
 % mods4pandas /srv/data/digisam_mets-sample-300
 INFO:root:Scanning directory /srv/data/digisam_mets-sample-300
 301it [00:00, 19579.19it/s]
 INFO:root:Processing METS files
 100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s]
-INFO:root:Writing DataFrame to mods_info_df.pkl
-~~~
+INFO:root:Writing DataFrame to mods_info_df.parquet
+```
 
 In the next example we convert the metadata from the ALTO files in the test data
 directory:
@@ -56,5 +59,5 @@ Scanning directory qurator/mods4pandas/tests/data/alto/PPN715049151
 Scanning directory qurator/mods4pandas/tests/data/alto/749782137
 Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns
 INFO:alto4pandas:Processing ALTO files
-INFO:alto4pandas:Writing DataFrame to alto_info_df.pkl
+INFO:alto4pandas:Writing DataFrame to alto_info_df.parquet
 ~~~
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 8edb659..65d7ada 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -373,8 +373,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
               default='mods_info_df.parquet', show_default=True)
 @click.option('--output-csv', type=click.Path(), help='Output CSV file')
 @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
-@click.option('--page-info', is_flag=True, show_default=True, default=False, help='Save page info')
-def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, page_info: bool):
+@click.option('--output-page-info', type=click.Path(), help='Save page info')
+def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str):
     """
     A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
 
@@ -420,11 +420,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
                     d['mets_file'] = mets_file
 
                     # METS - per-page
-                    if page_info:
+                    if output_page_info:
                         page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
 
                     mods_info.append(d)
-                    if page_info:
+                    if output_page_info:
                         page_info.extend(page_info_doc)
 
                     if caught_warnings:
@@ -450,12 +450,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
         mods_info_df.to_excel(output_xlsx)
 
     # Convert page_info
-    # XXX hardcoded filenames + other formats
-    if page_info:
+    if output_page_info:
         page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
         # Save the DataFrame
-        logger.info('Writing DataFrame to {}'.format("page_info_df.parquet"))
-        page_info_df.to_parquet("page_info_df.parquet")
+        logger.info('Writing DataFrame to {}'.format(output_page_info))
+        page_info_df.to_parquet(output_page_info)
 
 
 def main():