From dd4febf24d0f09dd0fcf1f592872c0d4bdb74f52 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 27 Jul 2024 12:57:33 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Write=20a=20Parquet=20file?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                      |  4 ++--
 requirements.txt               |  5 +++--
 src/mods4pandas/mods4pandas.py | 14 +++++++-------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 2fee8b7..69d6b38 100644
--- a/README.md
+++ b/README.md
@@ -28,8 +28,8 @@ alto4pandas /path/to/a/directory/full/of/alto_files
 ## Example
 In this example we convert the MODS metadata contained in the METS files in
 `/srv/data/digisam_mets-sample-300` to a pandas DataFrame under
-`mods_info_df.pkl`. This file can then be read by your data scientist using
-`pd.read_pickle()`.
+`mods_info_df.parquet`. This file can then be read by your data scientist using
+`pd.read_parquet()`.
 
 ~~~
 % mods4pandas /srv/data/digisam_mets-sample-300
diff --git a/requirements.txt b/requirements.txt
index 647c298..4b587e1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 click
-pandas ~= 1.1.5
-numpy < 2
+pandas
+numpy
 tqdm
 lxml
 openpyxl
+pyarrow
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 6a8c1c6..9fadf42 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -369,8 +369,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
 
 @click.command()
 @click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
-@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
-              default='mods_info_df.pkl', show_default=True)
+@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output file',
+              default='mods_info_df.parquet', show_default=True)
 @click.option('--output-csv', type=click.Path(), help='Output CSV file')
 @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
 def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
@@ -436,9 +436,9 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
     # Convert the mods_info List[Dict] to a pandas DataFrame
     mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
 
-    # Pickle the DataFrame
+    # Write the DataFrame to a Parquet file
     logger.info('Writing DataFrame to {}'.format(output_file))
-    mods_info_df.to_pickle(output_file)
+    mods_info_df.to_parquet(output_file)
     if output_csv:
         logger.info('Writing CSV to {}'.format(output_csv))
         mods_info_df.to_csv(output_csv)
@@ -449,9 +449,9 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
     # Convert page_info
     # XXX hardcoded filenames + other formats
     page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
-    # Pickle the DataFrame
-    logger.info('Writing DataFrame to {}'.format("page_info_df.pkl"))
-    page_info_df.to_pickle("page_info_df.pkl")
+    # Write the page_info DataFrame to a Parquet file
+    logger.info('Writing DataFrame to {}'.format("page_info_df.parquet"))
+    page_info_df.to_parquet("page_info_df.parquet")
 
 
 def main():