✨ Add a new CLI tool dinglehopper-extract to just give the extracted text

2025-12-12 21:54:12 +01:00 · 2020-10-21 16:30:48 +02:00 · 2020-10-21 16:30:48 +02:00 · 8b4ee20a40
commit 8b4ee20a40
parent b23b75b601
3 changed files with 33 additions and 0 deletions
--- a/qurator/dinglehopper/cli_extract.py
+++ b/qurator/dinglehopper/cli_extract.py
@@ -0,0 +1,27 @@
+import os
+
+import click
+
+from .extracted_text import ExtractedText
+from .ocr_files import extract
+
+
+@click.command()
+@click.argument("input_file", type=click.Path(exists=True))
+@click.option("--textequiv-level", default="region", metavar="LEVEL", help="PAGE TextEquiv level to extract text from")
+def main(input_file, textequiv_level):
+    """
+    Extract the text of the given INPUT_FILE.
+
+    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
+    its text and falls back to plain text if no ALTO or PAGE is detected.
+
+    By default, the text of PAGE files is extracted on 'region' level. You may
+    use "--textequiv-level line" to extract from the level of TextLine tags.
+    """
+    extracted = extract(input_file, textequiv_level=textequiv_level)
+    print(extracted.text)  # only the text itself is written to stdout
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()