You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
35 lines
921 B
Python
35 lines
921 B
Python
4 years ago
|
import os
|
||
|
|
||
|
import click
|
||
3 years ago
|
from ocrd_utils import initLogging
|
||
4 years ago
|
|
||
|
from .extracted_text import ExtractedText
|
||
|
from .ocr_files import extract
|
||
|
|
||
|
|
||
|
@click.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.option(
    "--textequiv-level",
    default="region",
    help="PAGE TextEquiv level to extract text from",
    metavar="LEVEL",
)
def main(input_file, textequiv_level):
    """
    Extract the text of the given INPUT_FILE.

    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
    its text and falls back to plain text if no ALTO or PAGE is detected.

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    # NOTE: the docstring above doubles as the command's --help text (click
    # convention), so treat its wording as user-facing output.

    # Configure logging (ocrd_utils) before any extraction work happens.
    initLogging()

    # extract() yields an object whose ``text`` attribute carries the
    # extracted text; the requested TextEquiv level is passed through as-is.
    extracted = extract(input_file, textequiv_level=textequiv_level)

    # Emit the result on stdout so it can be piped or redirected.
    print(extracted.text)
|
||
|
|
||
|
|
||
4 years ago
|
# Run the command-line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
|