diff --git a/README.md b/README.md index 6d00619..969578b 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,18 @@ mods4pandas /path/to/a/directory/containing/mets_files alto4pandas /path/to/a/directory/full/of/alto_files ~~~ +### Conversion to other formats + +CSV: +``` +python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_csv("mods_info_df.csv")' +``` +Excel (requires `XlsxWriter`): +``` +python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_excel("mods_info_df.xlsx" +, engine="xlsxwriter")' +``` + ## Example In this example we convert the MODS metadata contained in the METS files in `/srv/data/digisam_mets-sample-300` to a pandas DataFrame under diff --git a/requirements.txt b/requirements.txt index 4b587e1..6cc6778 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,5 @@ pandas numpy tqdm lxml -openpyxl pyarrow +XlsxWriter diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 65d7ada..ef24d36 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -369,19 +369,19 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: @click.command() @click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1) -@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output file', +@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file', default='mods_info_df.parquet', show_default=True) -@click.option('--output-csv', type=click.Path(), help='Output CSV file') -@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') -@click.option('--output-page-info', type=click.Path(), help='Save page info') -def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str): +@click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file') +def process(mets_files: List[str], output_file: str, 
output_page_info: str): """ A tool to convert the MODS metadata in INPUT to a pandas DataFrame. INPUT is assumed to be a METS document with MODS metadata. INPUT may optionally be a directory. The tool then reads all files in the directory. - mods4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings. + mods4pandas writes two output files: A pandas DataFrame (as Parquet) and a CSV file with all conversion warnings. + + Per-page information (e.g. structure information) can be output to a separate Parquet file. """ # Extend file list if directories are given @@ -442,12 +442,6 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls # Save the DataFrame logger.info('Writing DataFrame to {}'.format(output_file)) mods_info_df.to_parquet(output_file) - if output_csv: - logger.info('Writing CSV to {}'.format(output_csv)) - mods_info_df.to_csv(output_csv) - if output_xlsx: - logger.info('Writing Excel .xlsx to {}'.format(output_xlsx)) - mods_info_df.to_excel(output_xlsx) # Convert page_info if output_page_info: