NOTE(review): line breaks and leading whitespace in this patch were
reconstructed from a whitespace-collapsed copy. The indentation of the
context lines in the setup.py hunk is an assumption -- confirm it against
the target tree, or apply with `git apply --ignore-whitespace`.

diff --git a/README.md b/README.md
index 3cf95a6..8c39217 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,11 @@ parameter:
 ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
 ~~~
 
+The tool `dinglehopper-extract` extracts the text of the given input file on
+stdout, for example:
+
+`dinglehopper-extract OCR-D-GT-PAGE/00000024.page.xml`
+
 Developer information
 ---------------------
 *Please refer to [README-DEV.md](README-DEV.md).*
diff --git a/qurator/dinglehopper/cli_extract.py b/qurator/dinglehopper/cli_extract.py
new file mode 100644
index 0000000..a5d36d8
--- /dev/null
+++ b/qurator/dinglehopper/cli_extract.py
@@ -0,0 +1,27 @@
+import os
+
+import click
+
+from .extracted_text import ExtractedText
+from .ocr_files import extract
+
+
+@click.command()
+@click.argument('input_file', type=click.Path(exists=True))
+@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
+def main(input_file, textequiv_level):
+    """
+    Extract the text of the given INPUT_FILE.
+
+    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
+    its text and falls back to plain text if no ALTO or PAGE is detected.
+
+    By default, the text of PAGE files is extracted on 'region' level. You may
+    use "--textequiv-level line" to extract from the level of TextLine tags.
+    """
+    input_text = extract(input_file, textequiv_level=textequiv_level).text
+    print(input_text)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
index f7a6113..7b8107a 100644
--- a/setup.py
+++ b/setup.py
@@ -22,6 +22,7 @@ setup(
     entry_points={
       'console_scripts': [
         'dinglehopper=qurator.dinglehopper.cli:main',
+        'dinglehopper-extract=qurator.dinglehopper.cli_extract:main',
         'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
       ]
     }