From dd4febf24d0f09dd0fcf1f592872c0d4bdb74f52 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Sat, 27 Jul 2024 12:57:33 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Write=20a=20Parquet=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 ++-- requirements.txt | 5 +++-- src/mods4pandas/mods4pandas.py | 14 +++++++------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 2fee8b7..69d6b38 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,8 @@ alto4pandas /path/to/a/directory/full/of/alto_files ## Example In this example we convert the MODS metadata contained in the METS files in `/srv/data/digisam_mets-sample-300` to a pandas DataFrame under -`mods_info_df.pkl`. This file can then be read by your data scientist using -`pd.read_pickle()`. +`mods_info_df.parquet`. This file can then be read by your data scientist using +`pd.read_parquet()`. ~~~ % mods4pandas /srv/data/digisam_mets-sample-300 diff --git a/requirements.txt b/requirements.txt index 647c298..4b587e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ click -pandas ~= 1.1.5 -numpy < 2 +pandas +numpy tqdm lxml openpyxl +pyarrow diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 6a8c1c6..9fadf42 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -369,8 +369,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: @click.command() @click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1) -@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file', - default='mods_info_df.pkl', show_default=True) +@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output file', + default='mods_info_df.parquet', show_default=True) @click.option('--output-csv', type=click.Path(), help='Output CSV file') @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str): @@ -436,9 +436,9 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls # Convert the mods_info List[Dict] to a pandas DataFrame mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") - # Pickle the DataFrame + # Save the DataFrame logger.info('Writing DataFrame to {}'.format(output_file)) - mods_info_df.to_pickle(output_file) + mods_info_df.to_parquet(output_file) if output_csv: logger.info('Writing CSV to {}'.format(output_csv)) mods_info_df.to_csv(output_csv) @@ -449,9 +449,9 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls # Convert page_info # XXX hardcoded filenames + other formats page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) - # Pickle the DataFrame - logger.info('Writing DataFrame to {}'.format("page_info_df.pkl")) - page_info_df.to_pickle("page_info_df.pkl") + # Save the DataFrame + logger.info('Writing DataFrame to {}'.format("page_info_df.parquet")) + page_info_df.to_parquet("page_info_df.parquet") def main():