diff --git a/qurator/sbb_textline_detector/ocrd_cli.py b/qurator/sbb_textline_detector/ocrd_cli.py
index ce00096..80002f0 100644
--- a/qurator/sbb_textline_detector/ocrd_cli.py
+++ b/qurator/sbb_textline_detector/ocrd_cli.py
@@ -7,17 +7,17 @@ import ocrd_models.ocrd_page
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 from ocrd_modelfactory import page_from_file
-from ocrd_models import OcrdFile
-from ocrd_models.ocrd_page_generateds import MetadataItemType, LabelsType, LabelType, \
-    CoordsType
+from ocrd_models.ocrd_page_generateds import CoordsType, PageType
 from ocrd_utils import (
     assert_file_grp_cardinality,
     getLogger,
     make_file_id,
-    MIMETYPE_PAGE,
     coordinates_for_segment,
     polygon_from_points,
     points_from_polygon,
 )
+import numpy as np
+from shapely.geometry import Polygon
+from shapely.ops import unary_union
 
 from pkg_resources import resource_string
@@ -63,8 +63,8 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
             page = pcgts.get_Page()
             page_image, page_coords, page_image_info = \
                 self.workspace.image_from_page(
-                        page, page_id,
-                        feature_filter='cropped,binarized,grayscale_normalized'
+                    page, page_id,
+                    feature_filter='cropped,binarized,grayscale_normalized'
                 )
 
             with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -84,64 +84,145 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
 
             # Create a new PAGE file from the input file
             pcgts.set_pcGtsId(file_id)
-            page = pcgts.get_Page()
 
             # Merge results → PAGE file
 
             # 1. Border
             if page.get_Border():
-                log.warning("Page already contained a border")
+                log.warning("Removing existing page border")
+                page.set_Border(None)
             # We need to translate the coordinates:
-            text_border = tmp_page.get_Border()
-            coords = text_border.get_Coords().get_points()
-            polygon = polygon_from_points(coords)
-            polygon_new = coordinates_for_segment(polygon, page_image, page_coords)
-            points_new = points_from_polygon(polygon_new)
-            coords_new = CoordsType(points=points_new)
-            text_border.set_Coords(coords_new)
-            page.set_Border(text_border)
+            text_border = adapt_coords(tmp_page.get_Border(), page, page_coords)
+            if text_border is None:
+                # intersection is empty (border outside of rotated original image)
+                log.warning("new border would be empty, skipping")
+            else:
+                page.set_Border(text_border)
 
             # 2. ReadingOrder
             if page.get_ReadingOrder():
-                log.warning("Page already contained a reading order")
+                log.warning("Removing existing regions' reading order")
             page.set_ReadingOrder(tmp_page.get_ReadingOrder())
 
             # 3. TextRegion
+            # FIXME: what about table and image regions?
             if page.get_TextRegion():
-                log.warning("Page already contained text regions")
+                log.warning("Removing existing text regions")
             # We need to translate the coordinates:
             text_regions_new = []
             for text_region in tmp_page.get_TextRegion():
-                coords = text_region.get_Coords().get_points()
-                polygon = polygon_from_points(coords)
-                polygon_new = coordinates_for_segment(polygon, page_image, page_coords)
-                points_new = points_from_polygon(polygon_new)
-                coords_new = CoordsType(points=points_new)
-                text_region.set_Coords(coords_new)
+                text_region = adapt_coords(text_region, page, page_coords)
+                if text_region is None:
+                    # intersection is empty (polygon outside of above border)
+                    log.warning("new text region polygon would be empty, skipping")
+                    continue
                 text_regions_new.append(text_region)
+                text_lines_new = []
+                for text_line in text_region.get_TextLine():
+                    text_line = adapt_coords(text_line, text_region, page_coords)
+                    if text_line is None:
+                        # intersection is empty (polygon outside of region)
+                        log.warning("new text line polygon would be empty, skipping")
+                        continue
+                    text_lines_new.append(text_line)
+                text_region.set_TextLine(text_lines_new)
             page.set_TextRegion(text_regions_new)
 
             # Save metadata about this operation
-            metadata = pcgts.get_Metadata()
-            metadata.add_MetadataItem(
-                MetadataItemType(type_="processingStep",
-                                 name=self.ocrd_tool['steps'][0],
-                                 value=TOOL,
-                                 Labels=[LabelsType(
-                                     externalModel="ocrd-tool",
-                                     externalId="parameters",
-                                     Label=[LabelType(type_=name, value=self.parameter[name])
-                                            for name in self.parameter.keys()])]))
+            self.add_metadata(pcgts)
 
             self.workspace.add_file(
-                    ID=file_id,
-                    file_grp=self.output_file_grp,
-                    pageId=page_id,
-                    mimetype='application/vnd.prima.page+xml',
-                    local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
-                    content=ocrd_models.ocrd_page.to_xml(pcgts)
+                ID=file_id,
+                file_grp=self.output_file_grp,
+                pageId=page_id,
+                mimetype='application/vnd.prima.page+xml',
+                local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
+                content=ocrd_models.ocrd_page.to_xml(pcgts)
             )
 
+def adapt_coords(segment, parent, transform):
+    """Re-project a segment's outline and clip it to its parent.
+
+    Map the segment's Coords through ``transform``, clip the result with
+    polygon_for_parent and store it back on the segment. Return the
+    segment, or None if nothing is left inside the parent.
+    """
+    points = segment.get_Coords().get_points()
+    polygon = polygon_from_points(points)
+    # polygon absolute coords (after transforming back from page coords, e.g. deskewing)
+    polygon_new = coordinates_for_segment(polygon, None, transform)
+    # intersection with parent polygon
+    polygon_new = polygon_for_parent(polygon_new, parent)
+    if polygon_new is None:
+        return None
+    points_new = points_from_polygon(polygon_new)
+    segment.set_Coords(CoordsType(points=points_new))
+    return segment
+
+# from ocrd_tesserocr, to be integrated into core (somehow)...
+def polygon_for_parent(polygon, parent):
+    """Clip polygon to parent polygon range.
+
+    (Should be moved to ocrd_utils.coordinates_for_segment.)
+    """
+    childp = Polygon(polygon)
+    if isinstance(parent, PageType):
+        if parent.get_Border():
+            parentp = Polygon(polygon_from_points(parent.get_Border().get_Coords().points))
+        else:
+            parentp = Polygon([[0, 0], [0, parent.get_imageHeight()],
+                               [parent.get_imageWidth(), parent.get_imageHeight()],
+                               [parent.get_imageWidth(), 0]])
+    else:
+        parentp = Polygon(polygon_from_points(parent.get_Coords().points))
+    # check if clipping is necessary
+    if childp.within(parentp):
+        return polygon
+    # ensure input coords have valid paths (without self-intersection)
+    # (this can happen when shapes valid in floating point are rounded)
+    childp = make_valid(childp)
+    parentp = make_valid(parentp)
+    # clip to parent
+    interp = childp.intersection(parentp)
+    if interp.is_empty or interp.area == 0.0:
+        # this happens if Tesseract "finds" something
+        # outside of the valid Border of a deskewed/cropped page
+        # (empty corners created by masking); will be ignored
+        return None
+    if interp.geom_type == 'GeometryCollection':
+        # heterogeneous result: filter zero-area shapes (LineString, Point)
+        interp = unary_union([geom for geom in interp.geoms if geom.area > 0])
+    if interp.geom_type == 'MultiPolygon':
+        # homogeneous result: construct convex hull to connect
+        # FIXME: construct concave hull / alpha shape
+        interp = interp.convex_hull
+    if interp.minimum_clearance < 1.0:
+        # follow-up calculations will necessarily be integer;
+        # so anticipate rounding here and then ensure validity
+        interp = Polygon(np.round(interp.exterior.coords))
+        interp = make_valid(interp)
+    return interp.exterior.coords[:-1] # keep open
+
+# from ocrd_tesserocr, to be integrated into core (somehow)...
+def make_valid(polygon):
+    """Try to turn a polygon into a valid one.
+
+    First rotate the starting point of the exterior ring, then simplify
+    with growing tolerance. The result may still be invalid.
+    """
+    for split in range(1, len(polygon.exterior.coords)-1):
+        if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
+            break
+        # simplification may not be possible (at all) due to ordering
+        # in that case, try another starting point
+        polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split])
+    for tolerance in range(1, int(polygon.area)):
+        if polygon.is_valid:
+            break
+        # simplification may require a larger tolerance
+        polygon = polygon.simplify(tolerance)
+    return polygon
+
 
 if __name__ == '__main__':
     ocrd_sbb_textline_detector()
diff --git a/requirements.txt b/requirements.txt
index befcdfe..7a12381 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ scikit-learn
 numpy == 1.18.* # XXX for tensorflow-gpu 1.15
 tensorflow-gpu ~=1.15.2
 scipy
-ocrd >= 2.0.0
+ocrd >= 2.18.0
+shapely >= 1.7.1