✨ Add a new CLI tool dinglehopper-extract to just give the extracted text

2025-07-18 23:09:54 +02:00 · 2020-10-21 16:30:48 +02:00 · 2020-10-21 16:30:48 +02:00 · 8b4ee20a40
commit 8b4ee20a40
parent b23b75b601
3 changed files with 33 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -76,6 +76,11 @@ parameter:
 ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
 ~~~
 The tool `dinglehopper-extract` prints the text extracted from the given input
 file to stdout, for example:
 `dinglehopper-extract OCR-D-GT-PAGE/00000024.page.xml`
 Developer information
 ---------------------
 *Please refer to [README-DEV.md](README-DEV.md).*
--- a/qurator/dinglehopper/cli_extract.py
+++ b/qurator/dinglehopper/cli_extract.py
@ -0,0 +1,27 @@
 import os
 import click
 from .extracted_text import ExtractedText
 from .ocr_files import extract
@click.command()
@click.argument('input_file', type=click.Path(exists=True))
@click.option(
    '--textequiv-level',
    default='region',
    help='PAGE TextEquiv level to extract text from',
    metavar='LEVEL',
)
def main(input_file, textequiv_level):
    """
    Extract the text of the given INPUT_FILE.

    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
    its text and falls back to plain text if no ALTO or PAGE is detected.

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    # NOTE: click renders the docstring above as this command's --help text, so
    # it is written for CLI users. The blank lines matter: click re-wraps each
    # paragraph and only keeps breaks where a blank line separates them.
    extracted = extract(input_file, textequiv_level=textequiv_level)

    # Emit the extracted text on stdout so it can be piped or redirected.
    print(extracted.text)
 # Run the CLI when this module is executed directly as a script.
 if __name__ == '__main__':
    main()
--- a/setup.py
+++ b/setup.py
@ -22,6 +22,7 @@ setup(
    entry_points={
      'console_scripts': [
        'dinglehopper=qurator.dinglehopper.cli:main',
        'dinglehopper-extract=qurator.dinglehopper.cli_extract:main',
        'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
      ]
    }