1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-06-26 12:09:55 +02:00

✔ Test if dtypes are as expected in produced Parquet files

This commit is contained in:
Mike Gerber 2025-06-12 09:42:29 +02:00
parent 215bfbb11f
commit ac8740c33f
5 changed files with 130 additions and 92 deletions

View file

@@ -382,7 +382,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
default='mods_info_df.parquet', show_default=True)
@click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
-def process(mets_files: list[str], output_file: str, output_page_info: str):
+def process_command(mets_files: list[str], output_file: str, output_page_info: str):
"""
A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
@@ -393,7 +393,9 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
Per-page information (e.g. structure information) can be output to a separate Parquet file.
"""
+process(mets_files, output_file, output_page_info)
+def process(mets_files: list[str], output_file: str, output_page_info: str):
# Extend file list if directories are given
mets_files_real: list[str] = []
for m in mets_files:
@@ -476,7 +478,7 @@ def main():
for prefix, uri in ns.items():
ET.register_namespace(prefix, uri)
-process()
+process_command()
if __name__ == '__main__':