mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-07 01:19:56 +02:00
✨ Add a new CLI tool dinglehopper-extract to just give the extracted text
This commit is contained in:
parent
b23b75b601
commit
8b4ee20a40
3 changed files with 33 additions and 0 deletions
|
@ -76,6 +76,11 @@ parameter:
|
||||||
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
|
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
|
The tool `dinglehopper-extract` extracts the text of the given input file and
|
||||||
|
prints it to stdout, for example:
|
||||||
|
|
||||||
|
`dinglehopper-extract OCR-D-GT-PAGE/00000024.page.xml`
|
||||||
|
|
||||||
Developer information
|
Developer information
|
||||||
---------------------
|
---------------------
|
||||||
*Please refer to [README-DEV.md](README-DEV.md).*
|
*Please refer to [README-DEV.md](README-DEV.md).*
|
||||||
|
|
27
qurator/dinglehopper/cli_extract.py
Normal file
27
qurator/dinglehopper/cli_extract.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from .extracted_text import ExtractedText
|
||||||
|
from .ocr_files import extract
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.argument('input_file', type=click.Path(exists=True))
@click.option(
    '--textequiv-level',
    default='region',
    help='PAGE TextEquiv level to extract text from',
    metavar='LEVEL',
)
def main(input_file, textequiv_level):
    """
    Extract the text of the given INPUT_FILE.

    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
    its text and falls back to plain text if no ALTO or PAGE is detected.

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    # NOTE: the docstring above is what click shows for --help, so it is
    # user-facing text rather than internal documentation.
    extracted = extract(input_file, textequiv_level=textequiv_level)

    # Emit the extracted plain text on stdout.
    print(extracted.text)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the click command when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|
1
setup.py
1
setup.py
|
@ -22,6 +22,7 @@ setup(
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
'dinglehopper=qurator.dinglehopper.cli:main',
|
'dinglehopper=qurator.dinglehopper.cli:main',
|
||||||
|
'dinglehopper-extract=qurator.dinglehopper.cli_extract:main',
|
||||||
'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
|
'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue