1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-08-14 03:59:53 +02:00

Add --mets-files-list option to give a list of input files

This commit is contained in:
Gerber, Mike 2025-08-07 21:16:32 +02:00
parent 4178f1e380
commit 0855ccb66b

View file

@ -513,8 +513,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
return result return result
def validate_mets_files(ctx, param, mets_files):
    """
    Click callback for the ``mets_files`` argument: require some input source.

    Rejects an invocation that supplies neither positional METS files nor a
    non-empty ``mets_files_list`` value.

    :param ctx: click context; its ``params`` mapping is consulted for
        ``mets_files_list``
    :param param: the parameter this callback is attached to (unused)
    :param mets_files: the positional METS file paths, possibly empty
    :return: ``mets_files`` unchanged
    :raises click.BadParameter: if neither METS files nor a METS file list
        were given
    """
    # Positional files alone are sufficient; nothing else to check.
    if mets_files:
        return mets_files

    # Check the value, not just the presence of the key: process() only reads
    # the list file when mets_files_list is truthy, so an empty value would
    # otherwise pass validation and leave nothing to work on.
    # NOTE(review): this relies on the option already being present in
    # ctx.params when this callback runs -- confirm against click's parameter
    # processing order.
    if not ctx.params.get("mets_files_list"):
        raise click.BadParameter("Neither mets_files nor mets_files_list given")
    return mets_files
@click.command() @click.command()
@click.argument("mets_files", type=click.Path(exists=True), required=True, nargs=-1) @click.argument("mets_files", type=click.Path(exists=True), nargs=-1, callback=validate_mets_files)
@click.option( @click.option(
"--output", "--output",
"-o", "-o",
@ -527,7 +533,10 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
@click.option( @click.option(
"--output-page-info", type=click.Path(), help="Output page info Parquet file" "--output-page-info", type=click.Path(), help="Output page info Parquet file"
) )
def process_command(mets_files: list[str], output_file: str, output_page_info: str): @click.option(
"--mets-files-list", type=click.Path(), help="Read list of METS files from this file"
)
def process_command(mets_files: list[str], output_file: str, output_page_info: str, mets_files_list: str):
""" """
A tool to convert the MODS metadata in INPUT to a pandas DataFrame. A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
@ -538,12 +547,19 @@ def process_command(mets_files: list[str], output_file: str, output_page_info: s
Per-page information (e.g. structure information) can be output to a separate Parquet file. Per-page information (e.g. structure information) can be output to a separate Parquet file.
""" """
process(mets_files, output_file, output_page_info) process(mets_files, output_file, output_page_info, mets_files_list)
def process(mets_files: list[str], output_file: str, output_page_info: str):
# Extend file list if directories are given
def process(mets_files: list[str], output_file: str, output_page_info: str, mets_files_list: str):
mets_files_real: list[str] = [] mets_files_real: list[str] = []
if mets_files_list:
with open(mets_files_list) as f:
mets_files_real = [line.strip() for line in f.readlines()]
# Extend file list if directories are given
for m in mets_files: for m in mets_files:
if os.path.isdir(m): if os.path.isdir(m):
logger.info("Scanning directory {}".format(m)) logger.info("Scanning directory {}".format(m))