mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-09 11:49:55 +02:00
⚙ Make saving per-page information optional
This commit is contained in:
parent
dd4febf24d
commit
191867cdef
1 changed file with 11 additions and 7 deletions
|
@@ -373,7 +373,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
default='mods_info_df.parquet', show_default=True)
|
default='mods_info_df.parquet', show_default=True)
|
||||||
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
|
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
|
||||||
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
|
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
|
||||||
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
|
@click.option('--page-info', is_flag=True, show_default=True, default=False, help='Save page info')
|
||||||
|
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, page_info: bool):
|
||||||
"""
|
"""
|
||||||
A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
|
A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
|
||||||
|
|
||||||
|
@@ -419,9 +420,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
d['mets_file'] = mets_file
|
d['mets_file'] = mets_file
|
||||||
|
|
||||||
# METS - per-page
|
# METS - per-page
|
||||||
|
if page_info:
|
||||||
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
|
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
|
||||||
|
|
||||||
mods_info.append(d)
|
mods_info.append(d)
|
||||||
|
if page_info:
|
||||||
page_info.extend(page_info_doc)
|
page_info.extend(page_info_doc)
|
||||||
|
|
||||||
if caught_warnings:
|
if caught_warnings:
|
||||||
|
@@ -448,6 +451,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
|
|
||||||
# Convert page_info
|
# Convert page_info
|
||||||
# XXX hardcoded filenames + other formats
|
# XXX hardcoded filenames + other formats
|
||||||
|
if page_info:
|
||||||
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
|
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
|
||||||
# Save the DataFrame
|
# Save the DataFrame
|
||||||
logger.info('Writing DataFrame to {}'.format("page_info_df.parquet"))
|
logger.info('Writing DataFrame to {}'.format("page_info_df.parquet"))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue