From 0855ccb66bde5bbe9ee3250f54a1f4453b8287ee Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 7 Aug 2025 21:16:32 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Add=20--mets-files-list=20option=20?= =?UTF-8?q?to=20give=20a=20list=20of=20input=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index b595da3..212e158 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -513,8 +513,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: return result +def validate_mets_files(ctx, param, mets_files): + if not mets_files and "mets_files_list" not in ctx.params: + raise click.BadParameter("Neither mets_files nor mets_files_list given") + return mets_files + + @click.command() -@click.argument("mets_files", type=click.Path(exists=True), required=True, nargs=-1) +@click.argument("mets_files", type=click.Path(exists=True), nargs=-1, callback=validate_mets_files) @click.option( "--output", "-o", @@ -527,7 +533,10 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: @click.option( "--output-page-info", type=click.Path(), help="Output page info Parquet file" ) -def process_command(mets_files: list[str], output_file: str, output_page_info: str): +@click.option( + "--mets-files-list", type=click.Path(), help="Read list of METS files from this file" +) +def process_command(mets_files: list[str], output_file: str, output_page_info: str, mets_files_list: str): """ A tool to convert the MODS metadata in INPUT to a pandas DataFrame. @@ -538,12 +547,19 @@ def process_command(mets_files: list[str], output_file: str, output_page_info: s Per-page information (e.g. structure information) can be output to a separate Parquet file. """ - process(mets_files, output_file, output_page_info) + process(mets_files, output_file, output_page_info, mets_files_list) -def process(mets_files: list[str], output_file: str, output_page_info: str): - # Extend file list if directories are given + + +def process(mets_files: list[str], output_file: str, output_page_info: str, mets_files_list: str): mets_files_real: list[str] = [] + + if mets_files_list: + with open(mets_files_list) as f: + mets_files_real = [line.strip() for line in f.readlines()] + + # Extend file list if directories are given for m in mets_files: if os.path.isdir(m): logger.info("Scanning directory {}".format(m))