🐛 Fix converting/writing out per-page information (e.g. structure information)

master
Mike Gerber 5 months ago
parent 1bf86bfb4c
commit a1f333f4a4

@ -10,6 +10,9 @@ instead of ordered lists for topics, etc. Parts of the tool are specific to
our environment/needs at the State Library Berlin and may need to be changed for
your library.
Per-page information (e.g. structure information from the METS structMap) can
be converted as well (`--output-page-info`).
**alto4pandas** converts the metadata from ALTO files into a pandas DataFrame.
Column names are derived from the corresponding ALTO elements. Some columns
@ -31,14 +34,14 @@ In this example we convert the MODS metadata contained in the METS files in
`mods_info_df.parquet`. This file can then be read by your data scientist using
`pd.read_parquet()`.
~~~ ```
% mods4pandas /srv/data/digisam_mets-sample-300 % mods4pandas /srv/data/digisam_mets-sample-300
INFO:root:Scanning directory /srv/data/digisam_mets-sample-300 INFO:root:Scanning directory /srv/data/digisam_mets-sample-300
301it [00:00, 19579.19it/s] 301it [00:00, 19579.19it/s]
INFO:root:Processing METS files INFO:root:Processing METS files
100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s] 100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s]
INFO:root:Writing DataFrame to mods_info_df.pkl INFO:root:Writing DataFrame to mods_info_df.parquet
~~~ ```
In the next example we convert the metadata from the ALTO files in the test data
directory:
@ -56,5 +59,5 @@ Scanning directory qurator/mods4pandas/tests/data/alto/PPN715049151
Scanning directory qurator/mods4pandas/tests/data/alto/749782137 Scanning directory qurator/mods4pandas/tests/data/alto/749782137
Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns
INFO:alto4pandas:Processing ALTO files INFO:alto4pandas:Processing ALTO files
INFO:alto4pandas:Writing DataFrame to alto_info_df.pkl INFO:alto4pandas:Writing DataFrame to alto_info_df.parquet
~~~ ~~~

@ -373,8 +373,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
default='mods_info_df.parquet', show_default=True) default='mods_info_df.parquet', show_default=True)
@click.option('--output-csv', type=click.Path(), help='Output CSV file') @click.option('--output-csv', type=click.Path(), help='Output CSV file')
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
@click.option('--page-info', is_flag=True, show_default=True, default=False, help='Save page info') @click.option('--output-page-info', type=click.Path(), help='Save page info')
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, page_info: bool): def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str):
""" """
A tool to convert the MODS metadata in INPUT to a pandas DataFrame. A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
@ -420,11 +420,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
d['mets_file'] = mets_file d['mets_file'] = mets_file
# METS - per-page # METS - per-page
if page_info: if output_page_info:
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
mods_info.append(d) mods_info.append(d)
if page_info: if output_page_info:
page_info.extend(page_info_doc) page_info.extend(page_info_doc)
if caught_warnings: if caught_warnings:
@ -450,12 +450,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
mods_info_df.to_excel(output_xlsx) mods_info_df.to_excel(output_xlsx)
# Convert page_info # Convert page_info
# XXX hardcoded filenames + other formats if output_page_info:
if page_info:
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
# Save the DataFrame # Save the DataFrame
logger.info('Writing DataFrame to {}'.format("page_info_df.parquet")) logger.info('Writing DataFrame to {}'.format(output_page_info))
page_info_df.to_parquet("page_info_df.parquet") page_info_df.to_parquet(output_page_info)
def main(): def main():

Loading…
Cancel
Save