|
|
|
@ -7,17 +7,17 @@ import ocrd_models.ocrd_page
|
|
|
|
|
from ocrd import Processor
|
|
|
|
|
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
|
|
|
|
from ocrd_modelfactory import page_from_file
|
|
|
|
|
from ocrd_models import OcrdFile
|
|
|
|
|
from ocrd_models.ocrd_page_generateds import MetadataItemType, LabelsType, LabelType, \
|
|
|
|
|
CoordsType
|
|
|
|
|
from ocrd_models.ocrd_page_generateds import CoordsType, PageType
|
|
|
|
|
from ocrd_utils import (
|
|
|
|
|
assert_file_grp_cardinality,
|
|
|
|
|
getLogger,
|
|
|
|
|
make_file_id,
|
|
|
|
|
MIMETYPE_PAGE,
|
|
|
|
|
coordinates_for_segment,
|
|
|
|
|
polygon_from_points, points_from_polygon,
|
|
|
|
|
)
|
|
|
|
|
import numpy as np
|
|
|
|
|
from shapely.geometry import Polygon, asPolygon
|
|
|
|
|
from shapely.ops import unary_union
|
|
|
|
|
|
|
|
|
|
from pkg_resources import resource_string
|
|
|
|
|
|
|
|
|
@ -63,8 +63,8 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
|
|
|
|
|
page = pcgts.get_Page()
|
|
|
|
|
page_image, page_coords, page_image_info = \
|
|
|
|
|
self.workspace.image_from_page(
|
|
|
|
|
page, page_id,
|
|
|
|
|
feature_filter='cropped,binarized,grayscale_normalized'
|
|
|
|
|
page, page_id,
|
|
|
|
|
feature_filter='cropped,binarized,grayscale_normalized'
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dirname:
|
|
|
|
@ -84,64 +84,134 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
|
|
|
|
|
|
|
|
|
|
# Create a new PAGE file from the input file
|
|
|
|
|
pcgts.set_pcGtsId(file_id)
|
|
|
|
|
page = pcgts.get_Page()
|
|
|
|
|
|
|
|
|
|
# Merge results → PAGE file
|
|
|
|
|
|
|
|
|
|
# 1. Border
|
|
|
|
|
if page.get_Border():
|
|
|
|
|
log.warning("Page already contained a border")
|
|
|
|
|
log.warning("Removing existing page border")
|
|
|
|
|
page.set_Border(None)
|
|
|
|
|
# We need to translate the coordinates:
|
|
|
|
|
text_border = tmp_page.get_Border()
|
|
|
|
|
coords = text_border.get_Coords().get_points()
|
|
|
|
|
polygon = polygon_from_points(coords)
|
|
|
|
|
polygon_new = coordinates_for_segment(polygon, page_image, page_coords)
|
|
|
|
|
points_new = points_from_polygon(polygon_new)
|
|
|
|
|
coords_new = CoordsType(points=points_new)
|
|
|
|
|
text_border.set_Coords(coords_new)
|
|
|
|
|
page.set_Border(text_border)
|
|
|
|
|
text_border = adapt_coords(tmp_page.get_Border(), page, page_coords)
|
|
|
|
|
if text_border is None:
|
|
|
|
|
# intersection is empty (border outside of rotated original image)
|
|
|
|
|
log.warning("new border would be empty, skipping")
|
|
|
|
|
else:
|
|
|
|
|
page.set_Border(text_border)
|
|
|
|
|
|
|
|
|
|
# 2. ReadingOrder
|
|
|
|
|
if page.get_ReadingOrder():
|
|
|
|
|
log.warning("Page already contained a reading order")
|
|
|
|
|
log.warning("Removing existing regions' reading order")
|
|
|
|
|
page.set_ReadingOrder(tmp_page.get_ReadingOrder())
|
|
|
|
|
|
|
|
|
|
# 3. TextRegion
|
|
|
|
|
# FIXME: what about table and image regions?
|
|
|
|
|
if page.get_TextRegion():
|
|
|
|
|
log.warning("Page already contained text regions")
|
|
|
|
|
log.warning("Removing existing text regions")
|
|
|
|
|
# We need to translate the coordinates:
|
|
|
|
|
text_regions_new = []
|
|
|
|
|
for text_region in tmp_page.get_TextRegion():
|
|
|
|
|
coords = text_region.get_Coords().get_points()
|
|
|
|
|
polygon = polygon_from_points(coords)
|
|
|
|
|
polygon_new = coordinates_for_segment(polygon, page_image, page_coords)
|
|
|
|
|
points_new = points_from_polygon(polygon_new)
|
|
|
|
|
coords_new = CoordsType(points=points_new)
|
|
|
|
|
text_region.set_Coords(coords_new)
|
|
|
|
|
text_region = adapt_coords(text_region, page, page_coords)
|
|
|
|
|
if text_region is None:
|
|
|
|
|
# intersection is empty (polygon outside of above border)
|
|
|
|
|
log.warning("new text region polygon would be empty, skipping")
|
|
|
|
|
continue
|
|
|
|
|
text_regions_new.append(text_region)
|
|
|
|
|
text_lines_new = []
|
|
|
|
|
for text_line in text_region.get_TextLine():
|
|
|
|
|
text_line = adapt_coords(text_line, text_region, page_coords)
|
|
|
|
|
if text_line is None:
|
|
|
|
|
# intersection is empty (polygon outside of region)
|
|
|
|
|
log.warning("new text line polygon would be empty, skipping")
|
|
|
|
|
continue
|
|
|
|
|
text_lines_new.append(text_line)
|
|
|
|
|
text_region.set_TextLine(text_lines_new)
|
|
|
|
|
page.set_TextRegion(text_regions_new)
|
|
|
|
|
|
|
|
|
|
# Save metadata about this operation
|
|
|
|
|
metadata = pcgts.get_Metadata()
|
|
|
|
|
metadata.add_MetadataItem(
|
|
|
|
|
MetadataItemType(type_="processingStep",
|
|
|
|
|
name=self.ocrd_tool['steps'][0],
|
|
|
|
|
value=TOOL,
|
|
|
|
|
Labels=[LabelsType(
|
|
|
|
|
externalModel="ocrd-tool",
|
|
|
|
|
externalId="parameters",
|
|
|
|
|
Label=[LabelType(type_=name, value=self.parameter[name])
|
|
|
|
|
for name in self.parameter.keys()])]))
|
|
|
|
|
self.add_metadata(pcgts)
|
|
|
|
|
|
|
|
|
|
self.workspace.add_file(
|
|
|
|
|
ID=file_id,
|
|
|
|
|
file_grp=self.output_file_grp,
|
|
|
|
|
pageId=page_id,
|
|
|
|
|
mimetype='application/vnd.prima.page+xml',
|
|
|
|
|
local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
|
|
|
|
|
content=ocrd_models.ocrd_page.to_xml(pcgts)
|
|
|
|
|
ID=file_id,
|
|
|
|
|
file_grp=self.output_file_grp,
|
|
|
|
|
pageId=page_id,
|
|
|
|
|
mimetype='application/vnd.prima.page+xml',
|
|
|
|
|
local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
|
|
|
|
|
content=ocrd_models.ocrd_page.to_xml(pcgts)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def adapt_coords(segment, parent, transform):
    """Rewrite a segment's outline in transformed coordinates, clipped to its parent.

    The points of the segment's ``Coords`` are turned into a polygon, passed
    through ``coordinates_for_segment`` with ``transform`` (undoing page-level
    operations such as deskewing), clipped against ``parent`` by
    ``polygon_for_parent`` and finally written back onto ``segment`` in place.

    Args:
        segment: element offering ``get_Coords()`` / ``set_Coords()``; may be
            ``None`` when the optional element (e.g. a Border) is absent.
        parent: element to clip against; handed on to ``polygon_for_parent``.
        transform: coordinate transform handed on to ``coordinates_for_segment``.

    Returns:
        The same ``segment`` object carrying the new ``Coords``, or ``None``
        if there is no outline to adapt or the clipped outline is empty.
    """
    # An absent optional element has no outline to adapt; report it the same
    # way as an empty clipping result instead of raising AttributeError.
    if segment is None:
        return None
    coords = segment.get_Coords()
    if coords is None:
        return None
    polygon = polygon_from_points(coords.get_points())
    # polygon absolute coords (after transforming back from page coords, e.g. deskewing)
    polygon_new = coordinates_for_segment(polygon, None, transform)
    # intersection with parent polygon
    polygon_new = polygon_for_parent(polygon_new, parent)
    if polygon_new is None:
        return None
    segment.set_Coords(CoordsType(points=points_from_polygon(polygon_new)))
    return segment
|
|
|
|
|
|
|
|
|
|
# from ocrd_tesserocr, to be integrated into core (somehow)...
|
|
|
|
|
def polygon_for_parent(polygon, parent):
    """Clip polygon to parent polygon range.

    (Should be moved to ocrd_utils.coordinates_for_segment.)

    Args:
        polygon: sequence of coordinate pairs describing the child outline.
        parent: either a ``PageType`` (clipped against its Border if it has
            one, otherwise against the rectangle spanned by imageWidth and
            imageHeight) or any element offering ``get_Coords().points``.

    Returns:
        ``polygon`` itself if it already lies within the parent outline;
        otherwise the exterior coordinates of the clipped outline without the
        closing point; or ``None`` if the intersection is empty or has no area.
    """
    childp = Polygon(polygon)
    if isinstance(parent, PageType):
        border = parent.get_Border()
        if border:
            parentp = Polygon(polygon_from_points(border.get_Coords().points))
        else:
            width = parent.get_imageWidth()
            height = parent.get_imageHeight()
            parentp = Polygon([[0, 0], [0, height], [width, height], [width, 0]])
    else:
        parentp = Polygon(polygon_from_points(parent.get_Coords().points))
    # check if clipping is necessary
    if childp.within(parentp):
        return polygon
    # ensure input coords have valid paths (without self-intersection)
    # (this can happen when shapes valid in floating point are rounded)
    childp = make_valid(childp)
    parentp = make_valid(parentp)
    # clip to parent
    interp = childp.intersection(parentp)
    if interp.is_empty or interp.area == 0.0:
        # this happens if Tesseract "finds" something
        # outside of the valid Border of a deskewed/cropped page
        # (empty corners created by masking); will be ignored
        return None
    # `geom_type` is used instead of the deprecated `type` alias
    if interp.geom_type == 'GeometryCollection':
        # heterogeneous result: filter zero-area shapes (LineString, Point)
        interp = unary_union([geom for geom in interp.geoms if geom.area > 0])
    if interp.geom_type == 'MultiPolygon':
        # homogeneous result: construct convex hull to connect
        # FIXME: construct concave hull / alpha shape
        interp = interp.convex_hull
    if interp.minimum_clearance < 1.0:
        # follow-up calculations will necessarily be integer;
        # so anticipate rounding here and then ensure validity
        # (plain constructor instead of the deprecated asPolygon adapter)
        interp = Polygon(np.round(interp.exterior.coords))
        interp = make_valid(interp)
    return interp.exterior.coords[:-1]  # keep open
|
|
|
|
|
|
|
|
|
|
# from ocrd_tesserocr, to be integrated into core (somehow)...
|
|
|
|
|
def make_valid(polygon):
    """Try to turn an invalid polygon into a valid one.

    Two strategies are applied in turn: first the exterior ring is re-started
    at each of its points (simplification depends on the starting point), then
    the polygon is simplified with growing tolerance.

    Args:
        polygon: object with ``is_valid``, ``area``, ``exterior.coords`` and
            ``simplify(tolerance)`` (a ``shapely.geometry.Polygon``).

    Returns:
        ``polygon`` itself if it is already valid, otherwise the repaired
        polygon. If no rotation or tolerance yields a valid shape, the last
        candidate is returned and may still be invalid.
    """
    if polygon.is_valid:
        return polygon
    # Work on the open ring: the exterior repeats its first point at the end,
    # and rotating that duplicate into the middle would add a spurious vertex.
    ring = list(polygon.exterior.coords)[:-1]
    for split in range(1, len(ring)):
        if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
            break
        # simplification may not be possible (at all) due to ordering
        # in that case, try another starting point (always relative to the
        # original ring, so that every starting point is visited once)
        polygon = Polygon(ring[-split:] + ring[:-split])
    # Self-intersecting rings often have an area below 2, so the bound must
    # still allow at least one attempt; tolerances run 1, 2, ...
    for tolerance in range(int(polygon.area + 1.5)):
        if polygon.is_valid:
            break
        # simplification may require a larger tolerance
        polygon = polygon.simplify(tolerance + 1)
    return polygon
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the command-line interface only when this module is
# executed directly, not when it is imported.
if __name__ == '__main__':
    ocrd_sbb_textline_detector()
|
|
|
|
|