mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
Remove direct CSV/Excel support
Remove direct CSV/Excel support from the CLI; README now has instructions to convert. Closes gh-40.
This commit is contained in:
parent
a1f333f4a4
commit
7122f0265f
3 changed files with 19 additions and 13 deletions
12
README.md
12
README.md
|
@ -28,6 +28,18 @@ mods4pandas /path/to/a/directory/containing/mets_files
|
||||||
alto4pandas /path/to/a/directory/full/of/alto_files
|
alto4pandas /path/to/a/directory/full/of/alto_files
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
|
### Conversion to other formats
|
||||||
|
|
||||||
|
CSV:
|
||||||
|
```
|
||||||
|
python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_csv("mods_info_df.csv")'
|
||||||
|
```
|
||||||
|
Excel (requires `XlsxWriter`):
|
||||||
|
```
|
||||||
|
python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_excel("mods_info_df.xlsx", engine="xlsxwriter")'
|
||||||
|
```
|
||||||
|
|
||||||
## Example
|
## Example
|
||||||
In this example we convert the MODS metadata contained in the METS files in
|
In this example we convert the MODS metadata contained in the METS files in
|
||||||
`/srv/data/digisam_mets-sample-300` to a pandas DataFrame under
|
`/srv/data/digisam_mets-sample-300` to a pandas DataFrame under
|
||||||
|
|
|
@ -3,5 +3,5 @@ pandas
|
||||||
numpy
|
numpy
|
||||||
tqdm
|
tqdm
|
||||||
lxml
|
lxml
|
||||||
openpyxl
|
|
||||||
pyarrow
|
pyarrow
|
||||||
|
XlsxWriter
|
||||||
|
|
|
@ -369,19 +369,19 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
|
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
|
||||||
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output file',
|
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
|
||||||
default='mods_info_df.parquet', show_default=True)
|
default='mods_info_df.parquet', show_default=True)
|
||||||
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
|
@click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
|
||||||
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
|
def process(mets_files: List[str], output_file: str, output_page_info: str):
|
||||||
@click.option('--output-page-info', type=click.Path(), help='Save page info')
|
|
||||||
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str):
|
|
||||||
"""
|
"""
|
||||||
A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
|
A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
|
||||||
|
|
||||||
INPUT is assumed to be a METS document with MODS metadata. INPUT may optionally be a directory. The tool then reads
|
INPUT is assumed to be a METS document with MODS metadata. INPUT may optionally be a directory. The tool then reads
|
||||||
all files in the directory.
|
all files in the directory.
|
||||||
|
|
||||||
mods4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
|
mods4pandas writes two output files: A pandas DataFrame (as Parquet) and a CSV file with all conversion warnings.
|
||||||
|
|
||||||
|
Per-page information (e.g. structure information) can be output to a separate Parquet file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Extend file list if directories are given
|
# Extend file list if directories are given
|
||||||
|
@ -442,12 +442,6 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
# Save the DataFrame
|
# Save the DataFrame
|
||||||
logger.info('Writing DataFrame to {}'.format(output_file))
|
logger.info('Writing DataFrame to {}'.format(output_file))
|
||||||
mods_info_df.to_parquet(output_file)
|
mods_info_df.to_parquet(output_file)
|
||||||
if output_csv:
|
|
||||||
logger.info('Writing CSV to {}'.format(output_csv))
|
|
||||||
mods_info_df.to_csv(output_csv)
|
|
||||||
if output_xlsx:
|
|
||||||
logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
|
|
||||||
mods_info_df.to_excel(output_xlsx)
|
|
||||||
|
|
||||||
# Convert page_info
|
# Convert page_info
|
||||||
if output_page_info:
|
if output_page_info:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue