mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
🐛 Fix converting/writing out per-page information (e.g. structure information)
This commit is contained in:
parent
1bf86bfb4c
commit
a1f333f4a4
2 changed files with 14 additions and 12 deletions
11
README.md
11
README.md
|
@ -10,6 +10,9 @@ instead of ordered lists for topics, etc. Parts of the tool are specific to
|
||||||
our environment/needs at the State Library Berlin and may need to be changed for
|
our environment/needs at the State Library Berlin and may need to be changed for
|
||||||
your library.
|
your library.
|
||||||
|
|
||||||
|
Per-page information (e.g. structure information from the METS structMap) can
|
||||||
|
be converted as well and written to a separate Parquet file (`--output-page-info`).
|
||||||
|
|
||||||
**alto4pandas** converts the metadata from ALTO files into a pandas DataFrame.
|
**alto4pandas** converts the metadata from ALTO files into a pandas DataFrame.
|
||||||
|
|
||||||
Column names are derived from the corresponding ALTO elements. Some columns
|
Column names are derived from the corresponding ALTO elements. Some columns
|
||||||
|
@ -31,14 +34,14 @@ In this example we convert the MODS metadata contained in the METS files in
|
||||||
`mods_info_df.parquet`. This file can then be read by your data scientist using
|
`mods_info_df.parquet`. This file can then be read by your data scientist using
|
||||||
`pd.read_parquet()`.
|
`pd.read_parquet()`.
|
||||||
|
|
||||||
~~~
|
```
|
||||||
% mods4pandas /srv/data/digisam_mets-sample-300
|
% mods4pandas /srv/data/digisam_mets-sample-300
|
||||||
INFO:root:Scanning directory /srv/data/digisam_mets-sample-300
|
INFO:root:Scanning directory /srv/data/digisam_mets-sample-300
|
||||||
301it [00:00, 19579.19it/s]
|
301it [00:00, 19579.19it/s]
|
||||||
INFO:root:Processing METS files
|
INFO:root:Processing METS files
|
||||||
100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s]
|
100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s]
|
||||||
INFO:root:Writing DataFrame to mods_info_df.pkl
|
INFO:root:Writing DataFrame to mods_info_df.parquet
|
||||||
~~~
|
```
|
||||||
|
|
||||||
In the next example we convert the metadata from the ALTO files in the test data
|
In the next example we convert the metadata from the ALTO files in the test data
|
||||||
directory:
|
directory:
|
||||||
|
@ -56,5 +59,5 @@ Scanning directory qurator/mods4pandas/tests/data/alto/PPN715049151
|
||||||
Scanning directory qurator/mods4pandas/tests/data/alto/749782137
|
Scanning directory qurator/mods4pandas/tests/data/alto/749782137
|
||||||
Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns
|
Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns
|
||||||
INFO:alto4pandas:Processing ALTO files
|
INFO:alto4pandas:Processing ALTO files
|
||||||
INFO:alto4pandas:Writing DataFrame to alto_info_df.pkl
|
INFO:alto4pandas:Writing DataFrame to alto_info_df.parquet
|
||||||
~~~
|
~~~
|
||||||
|
|
|
@ -373,8 +373,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
default='mods_info_df.parquet', show_default=True)
|
default='mods_info_df.parquet', show_default=True)
|
||||||
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
|
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
|
||||||
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
|
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
|
||||||
@click.option('--page-info', is_flag=True, show_default=True, default=False, help='Save page info')
|
@click.option('--output-page-info', type=click.Path(), help='Save page info')
|
||||||
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, page_info: bool):
|
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str):
|
||||||
"""
|
"""
|
||||||
A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
|
A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
|
||||||
|
|
||||||
|
@ -420,11 +420,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
d['mets_file'] = mets_file
|
d['mets_file'] = mets_file
|
||||||
|
|
||||||
# METS - per-page
|
# METS - per-page
|
||||||
if page_info:
|
if output_page_info:
|
||||||
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
|
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
|
||||||
|
|
||||||
mods_info.append(d)
|
mods_info.append(d)
|
||||||
if page_info:
|
if output_page_info:
|
||||||
page_info.extend(page_info_doc)
|
page_info.extend(page_info_doc)
|
||||||
|
|
||||||
if caught_warnings:
|
if caught_warnings:
|
||||||
|
@ -450,12 +450,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
mods_info_df.to_excel(output_xlsx)
|
mods_info_df.to_excel(output_xlsx)
|
||||||
|
|
||||||
# Convert page_info
|
# Convert page_info
|
||||||
# XXX hardcoded filenames + other formats
|
if output_page_info:
|
||||||
if page_info:
|
|
||||||
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
|
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
|
||||||
# Save the DataFrame
|
# Save the DataFrame
|
||||||
logger.info('Writing DataFrame to {}'.format("page_info_df.parquet"))
|
logger.info('Writing DataFrame to {}'.format(output_page_info))
|
||||||
page_info_df.to_parquet("page_info_df.parquet")
|
page_info_df.to_parquet(output_page_info)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue