mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
✨ Add a new CLI tool dinglehopper-extract to just give the extracted text
This commit is contained in:
parent
b23b75b601
commit
8b4ee20a40
3 changed files with 33 additions and 0 deletions
27
qurator/dinglehopper/cli_extract.py
Normal file
27
qurator/dinglehopper/cli_extract.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import os
|
||||
|
||||
import click
|
||||
|
||||
from .extracted_text import ExtractedText
|
||||
from .ocr_files import extract
|
||||
|
||||
|
||||
@click.command()
@click.argument('input_file', type=click.Path(exists=True, dir_okay=False))
@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
def main(input_file, textequiv_level):
    """
    Extract the text of the given INPUT_FILE.

    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
    its text and falls back to plain text if no ALTO or PAGE is detected.

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    # NOTE: click uses the docstring above as the command's --help text, so it
    # is written for end users rather than for maintainers.
    #
    # dir_okay=False: INPUT_FILE must be a single document; rejecting a
    # directory at argument-parsing time yields a usage error instead of
    # handing the directory path to extract().
    input_text = extract(input_file, textequiv_level=textequiv_level).text

    # click.echo is click's own output helper (still appends a newline) and is
    # more forgiving than print() about terminal/stream encoding quirks.
    click.echo(input_text)
|
||||
|
||||
|
||||
# Run the command-line interface only when this module is executed directly;
# importing the module must not trigger the command.
if __name__ == "__main__":
    main()
|
Loading…
Add table
Add a link
Reference in a new issue