mirror of https://github.com/qurator-spk/page2tsv.git
synced 2025-06-10 03:49:57 +02:00

commit d80b02c56d (parent e21fbc09a1)

    use OCR-D/core PAGE API for reading order and recursive regions

3 changed files with 25 additions and 44 deletions
.gitignore (vendored, new file)

@@ -0,0 +1,2 @@
+*.egg-info
+__pycache__
requirements.txt

@@ -1,5 +1,4 @@
-numpy
+ocrd >= 2.23.2
 pandas
-click
 requests
 matplotlib
Python CLI module:

@@ -1,13 +1,16 @@
-import numpy as np
-import click
-import pandas as pd
-from io import StringIO
-import os
-import xml.etree.ElementTree as ET
-import requests
 import json
 import glob
 import re
+import os
+from io import StringIO
+
+import numpy as np
+import click
+import pandas as pd
+import requests
+
+from ocrd_models.ocrd_page import parse
+from ocrd_utils import bbox_from_points
 
 from .ned import ned
 from .ner import ner
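The two new imports come from OCR-D/core. As a quick illustration of bbox_from_points, which the hunks below rely on, here is a minimal sketch (the points string is made up for the example):

    from ocrd_utils import bbox_from_points

    # PAGE-XML Coords store a polygon as space-separated "x,y" pairs;
    # bbox_from_points reduces such a string to its enclosing bounding box.
    left, top, right, bottom = bbox_from_points('100,20 200,20 200,60 100,60')
    assert (left, top, right, bottom) == (100, 20, 200, 60)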
@@ -75,12 +78,10 @@ def annotate_tsv(tsv_file, annotated_tsv_file):
 @click.option('--max-confidence', type=float, default=None)
 def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint, noproxy, scale_factor,
              ned_threshold, min_confidence, max_confidence):
 
     if purpose == "NERD":
         out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
     elif purpose == "OCR":
         out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
 
     if min_confidence is not None and max_confidence is not None:
         out_columns += ['ocrconf']
     else:
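The purpose-dependent column lists above become the header row of the output TSV: when the output file does not exist yet, the hunk below seeds it with an empty DataFrame so that only the header is written. A minimal sketch of that seeding step (the output path is illustrative):

    import pandas as pd

    out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']

    # An empty DataFrame emits only the header row; sep='\t' makes it a TSV
    # and quoting=3 (csv.QUOTE_NONE) keeps tokens unquoted.
    pd.DataFrame([], columns=out_columns).to_csv('out.tsv', sep='\t', quoting=3, index=False)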
@@ -89,57 +90,36 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
     if noproxy:
         os.environ['no_proxy'] = '*'
 
-    tree = ET.parse(page_xml_file)
-    xmlns = tree.getroot().tag.split('}')[0].strip('{')
-
     urls = []
     if os.path.exists(tsv_out_file):
         parts = extract_doc_links(tsv_out_file)
 
         urls = [part['url'] for part in parts]
     else:
         pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)
 
+    pcgts = parse(page_xml_file)
     tsv = []
     line_info = []
-    for rgn_number, region in enumerate(tree.findall('.//{%s}TextRegion' % xmlns)):
-
-        for text_line in region.findall('.//{%s}TextLine' % xmlns):
-            points = [int(scale_factor * float(pos)) for coords in text_line.findall('./{%s}Coords' % xmlns) for p in
-                      coords.attrib['points'].split(' ') for pos in p.split(',')]
-
-            x_points, y_points = points[0::2], points[1::2]
-
-            left, right, top, bottom = min(x_points), max(x_points), min(y_points), max(y_points)
-
+    for region_idx, region in enumerate(pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')):
+        for text_line in region.get_TextLine():
+            left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]
+
             if min_confidence is not None and max_confidence is not None:
-                conf = np.max([float(text.attrib['conf']) for text in text_line.findall('./{%s}TextEquiv' % xmlns)])
+                conf = np.max([textequiv.conf for textequiv in text_line.get_TextEquiv()])
             else:
                 conf = np.nan
 
             line_info.append((len(urls), left, right, top, bottom, conf))
 
-            for word in text_line.findall('./{%s}Word' % xmlns):
-                for text_equiv in word.findall('./{%s}TextEquiv/{%s}Unicode' % (xmlns, xmlns)):
-                    text = text_equiv.text
-
-                    points = []
-
-                    for coords in word.findall('./{%s}Coords' % xmlns):
-
-                        # transform OCR coordinates using `scale_factor` to derive
-                        # correct coordinates for the web presentation image
-                        points += [int(scale_factor * float(pos))
-                                   for p in coords.attrib['points'].split(' ') for pos in p.split(',')]
-
-                    x_points, y_points = points[0::2], points[1::2]
-
-                    left, right, top, bottom = min(x_points), max(x_points), min(y_points), max(y_points)
-
-                    tsv.append((rgn_number, len(line_info)-1, left + (right - left) / 2.0, text,
-                                len(urls), left, right, top, bottom))
+            for word in text_line.get_Word():
+                for text_equiv in word.get_TextEquiv():
+                    # transform OCR coordinates using `scale_factor` to derive
+                    # correct coordinates for the web presentation image
+                    left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]
+
+                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
+                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom))
 
     line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf'])
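Taken together, the change replaces the hand-rolled xml.etree traversal with OCR-D/core's PAGE API, which resolves the page's reading order and recurses into nested regions. A minimal, self-contained sketch of the new traversal pattern (the file name page.xml is illustrative):

    from ocrd_models.ocrd_page import parse
    from ocrd_utils import bbox_from_points

    pcgts = parse('page.xml')

    # get_AllRegions(classes=['Text'], order='reading-order') walks nested
    # (recursive) regions and yields them in reading order, replacing the
    # flat tree.findall('.//TextRegion') lookup of the old code.
    for region in pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order'):
        for text_line in region.get_TextLine():
            print('line bbox:', bbox_from_points(text_line.get_Coords().points))
            for word in text_line.get_Word():
                for text_equiv in word.get_TextEquiv():
                    print(text_equiv.get_Unicode(), bbox_from_points(word.get_Coords().points))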