use OCR-D/core PAGE API for reading order and recursive regions

pull/4/head
Konstantin Baierer 4 years ago
parent e21fbc09a1
commit d80b02c56d

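This commit replaces the hand-rolled ElementTree/XPath walk over the PAGE-XML with the generated PAGE API from OCR-D/core, which resolves the document's ReadingOrder and recurses into nested regions. A minimal sketch of the calls the diff below adopts ('page.xml' is a placeholder path, not part of this repository):

    # Sketch only: iterate text regions in reading order with the OCR-D PAGE API.
    from ocrd_models.ocrd_page import parse
    from ocrd_utils import bbox_from_points

    pcgts = parse('page.xml')  # placeholder file name
    page = pcgts.get_Page()

    # get_AllRegions descends into nested (recursive) regions; order='reading-order'
    # sorts them by the ReadingOrder element instead of raw document order.
    for region in page.get_AllRegions(classes=['Text'], order='reading-order'):
        for text_line in region.get_TextLine():
            # bbox_from_points turns a PAGE points string into (x_min, y_min, x_max, y_max)
            left, top, right, bottom = bbox_from_points(text_line.get_Coords().points)
            print(region.get_id(), text_line.get_id(), (left, top, right, bottom))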
.gitignore vendored

@@ -0,0 +1,2 @@
+*.egg-info
+__pycache__

@@ -1,5 +1,4 @@
-numpy
+ocrd >= 2.23.2
 pandas
-click
 requests
 matplotlib

@@ -1,13 +1,16 @@
+import json
+import glob
+import re
+import os
+from io import StringIO
 import numpy as np
 import click
 import pandas as pd
-from io import StringIO
-import os
-import xml.etree.ElementTree as ET
 import requests
-import json
-import glob
-import re
+from ocrd_models.ocrd_page import parse
+from ocrd_utils import bbox_from_points
 from .ned import ned
 from .ner import ner
@@ -75,12 +78,10 @@ def annotate_tsv(tsv_file, annotated_tsv_file):
 @click.option('--max-confidence', type=float, default=None)
 def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint, noproxy, scale_factor,
              ned_threshold, min_confidence, max_confidence):
     if purpose == "NERD":
         out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
     elif purpose == "OCR":
         out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
         if min_confidence is not None and max_confidence is not None:
             out_columns += ['ocrconf']
     else:
@@ -89,57 +90,36 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
     if noproxy:
         os.environ['no_proxy'] = '*'
-    tree = ET.parse(page_xml_file)
-    xmlns = tree.getroot().tag.split('}')[0].strip('{')
     urls = []
     if os.path.exists(tsv_out_file):
         parts = extract_doc_links(tsv_out_file)
         urls = [part['url'] for part in parts]
     else:
         pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)
+    pcgts = parse(page_xml_file)
     tsv = []
     line_info = []
-    for rgn_number, region in enumerate(tree.findall('.//{%s}TextRegion' % xmlns)):
-        for text_line in region.findall('.//{%s}TextLine' % xmlns):
-            points = [int(scale_factor * float(pos)) for coords in text_line.findall('./{%s}Coords' % xmlns) for p in
-                      coords.attrib['points'].split(' ') for pos in p.split(',')]
-            x_points, y_points = points[0::2], points[1::2]
-            left, right, top, bottom = min(x_points), max(x_points), min(y_points), max(y_points)
+    for region_idx, region in enumerate(pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')):
+        for text_line in region.get_TextLine():
+            left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]
             if min_confidence is not None and max_confidence is not None:
-                conf = np.max([float(text.attrib['conf']) for text in text_line.findall('./{%s}TextEquiv' % xmlns)])
+                conf = np.max([textequiv.conf for textequiv in text_line.get_TextEquiv()])
             else:
                 conf = np.nan
             line_info.append((len(urls), left, right, top, bottom, conf))
-            for word in text_line.findall('./{%s}Word' % xmlns):
-                for text_equiv in word.findall('./{%s}TextEquiv/{%s}Unicode' % (xmlns, xmlns)):
-                    text = text_equiv.text
-                    points = []
-                    for coords in word.findall('./{%s}Coords' % xmlns):
+            for word in text_line.get_Word():
+                for text_equiv in word.get_TextEquiv():
                     # transform OCR coordinates using `scale_factor` to derive
                     # correct coordinates for the web presentation image
-                        points += [int(scale_factor * float(pos))
-                                   for p in coords.attrib['points'].split(' ') for pos in p.split(',')]
-                    x_points, y_points = points[0::2], points[1::2]
-                    left, right, top, bottom = min(x_points), max(x_points), min(y_points), max(y_points)
-                    tsv.append((rgn_number, len(line_info)-1, left + (right - left) / 2.0, text,
-                                len(urls), left, right, top, bottom))
+                    left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]
+                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
+                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom))
     line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf'])
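The confidence handling also gets simpler: instead of parsing conf attributes out of raw TextEquiv elements, the generated TextEquivType exposes them directly. A sketch of the accessors used above, assuming text_line is a TextLineType as in the loop (the empty-list guard is illustrative, not part of the diff):

    import numpy as np

    # conf is an optional attribute on TextEquivType (None if absent);
    # get_Unicode() returns the text content of the Unicode child element.
    confs = [textequiv.conf for textequiv in text_line.get_TextEquiv()]
    line_conf = np.max(confs) if confs else np.nan

    for word in text_line.get_Word():
        for text_equiv in word.get_TextEquiv():
            print(text_equiv.get_Unicode(), text_equiv.conf)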
