From 8b4ee20a40328d0eb0b7f4cc6236341545001788 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Wed, 21 Oct 2020 16:30:48 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Add=20a=20new=20CLI=20tool=20dingle?=
 =?UTF-8?q?hopper-extract=20to=20just=20give=20the=20extracted=20text?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (commentary only; this area is not part of the commit):

* What the patch does: creates qurator/dinglehopper/cli_extract.py with a
  click command `main(input_file, textequiv_level)`, registers it in
  setup.py as the console script `dinglehopper-extract`, and adds a usage
  example to README.md.
* `main` forwards INPUT_FILE and --textequiv-level (default 'region') to
  `extract()` from `.ocr_files` and prints the `.text` attribute of the
  result to stdout. It has no return statement.
* The docstring of `main` is presumably also the `--help` text (click uses
  the callback docstring by default), so treat it as user-facing wording.
* NOTE(review): `import os` and `from .extracted_text import ExtractedText`
  are not referenced anywhere in the new module; candidates for removal in
  a follow-up.
* NOTE(review): line structure was restored from a copy whose line breaks
  had been collapsed. The `From:` header carries no address here, and the
  indentation of the context lines in the setup.py hunk is reconstructed.
  Verify both against the repository before applying.

 README.md                           |  5 +++++
 qurator/dinglehopper/cli_extract.py | 27 +++++++++++++++++++++++++++
 setup.py                            |  1 +
 3 files changed, 33 insertions(+)
 create mode 100644 qurator/dinglehopper/cli_extract.py

diff --git a/README.md b/README.md
index 3cf95a6..8c39217 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,11 @@ parameter:
 ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
 ~~~
 
+The tool `dinglehopper-extract` extracts the text of the given input file on
+stdout, for example:
+
+`dinglehopper-extract OCR-D-GT-PAGE/00000024.page.xml`
+
 Developer information
 ---------------------
 *Please refer to [README-DEV.md](README-DEV.md).*
diff --git a/qurator/dinglehopper/cli_extract.py b/qurator/dinglehopper/cli_extract.py
new file mode 100644
index 0000000..a5d36d8
--- /dev/null
+++ b/qurator/dinglehopper/cli_extract.py
@@ -0,0 +1,27 @@
+import os
+
+import click
+
+from .extracted_text import ExtractedText
+from .ocr_files import extract
+
+
+@click.command()
+@click.argument('input_file', type=click.Path(exists=True))
+@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
+def main(input_file, textequiv_level):
+    """
+    Extract the text of the given INPUT_FILE.
+
+    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
+    its text and falls back to plain text if no ALTO or PAGE is detected.
+
+    By default, the text of PAGE files is extracted on 'region' level. You may
+    use "--textequiv-level line" to extract from the level of TextLine tags.
+    """
+    input_text = extract(input_file, textequiv_level=textequiv_level).text
+    print(input_text)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
index f7a6113..7b8107a 100644
--- a/setup.py
+++ b/setup.py
@@ -22,6 +22,7 @@ setup(
     entry_points={
       'console_scripts': [
         'dinglehopper=qurator.dinglehopper.cli:main',
+        'dinglehopper-extract=qurator.dinglehopper.cli_extract:main',
         'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
       ]
     }