@ -7,17 +7,17 @@ import ocrd_models.ocrd_page
from ocrd import Processor
from ocrd . decorators import ocrd_cli_options , ocrd_cli_wrap_processor
from ocrd_modelfactory import page_from_file
from ocrd_models import OcrdFile
from ocrd_models . ocrd_page_generateds import MetadataItemType , LabelsType , LabelType , \
CoordsType
from ocrd_models . ocrd_page_generateds import CoordsType , PageType
from ocrd_utils import (
assert_file_grp_cardinality ,
getLogger ,
make_file_id ,
MIMETYPE_PAGE ,
coordinates_for_segment ,
polygon_from_points , points_from_polygon ,
)
import numpy as np
from shapely . geometry import Polygon , asPolygon
from shapely . ops import unary_union
from pkg_resources import resource_string
@ -63,8 +63,8 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
page = pcgts . get_Page ( )
page_image , page_coords , page_image_info = \
self . workspace . image_from_page (
page , page_id ,
feature_filter = ' cropped,binarized,grayscale_normalized '
page , page_id ,
feature_filter = ' cropped,binarized,grayscale_normalized '
)
with tempfile . TemporaryDirectory ( ) as tmp_dirname :
@ -84,64 +84,134 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
# Create a new PAGE file from the input file
pcgts . set_pcGtsId ( file_id )
page = pcgts . get_Page ( )
# Merge results → PAGE file
# 1. Border
if page . get_Border ( ) :
log . warning ( " Page already contained a border " )
log . warning ( " Removing existing page border " )
page . set_Border ( None )
# We need to translate the coordinates:
text_border = tmp_page . get_Border ( )
coords = text_border . get_Coords ( ) . get_points ( )
polygon = polygon_from_points ( coords )
polygon_new = coordinates_for_segment ( polygon , page_image , page_coords )
points_new = points_from_polygon ( polygon_new )
coords_new = CoordsType ( points = points_new )
text_border . set_Coords ( coords_new )
page . set_Border ( text_border )
text_border = adapt_coords ( tmp_page . get_Border ( ) , page , page_coords )
if text_border is None :
# intersection is empty (border outside of rotated original image)
log . warning ( " new border would be empty, skipping " )
else :
page . set_Border ( text_border )
# 2. ReadingOrder
if page . get_ReadingOrder ( ) :
log . warning ( " Page already contained a reading order" )
log . warning ( " Removing existing regions' reading order" )
page . set_ReadingOrder ( tmp_page . get_ReadingOrder ( ) )
# 3. TextRegion
# FIXME: what about table and image regions?
if page . get_TextRegion ( ) :
log . warning ( " Page already contained text regions" )
log . warning ( " Removing existing text regions" )
# We need to translate the coordinates:
text_regions_new = [ ]
for text_region in tmp_page . get_TextRegion ( ) :
coords = text_region . get_Coords ( ) . get_points ( )
polygon = polygon_from_points ( coords )
polygon_new = coordinates_for_segment ( polygon , page_image , page_coords )
points_new = points_from_polygon ( polygon_new )
coords_new = CoordsType ( points = points_new )
text_region . set_Coords ( coords_new )
text_region = adapt_coords ( text_region , page , page_coords )
if text_region is None :
# intersection is empty (polygon outside of above border)
log . warning ( " new text region polygon would be empty, skipping " )
continue
text_regions_new . append ( text_region )
text_lines_new = [ ]
for text_line in text_region . get_TextLine ( ) :
text_line = adapt_coords ( text_line , text_region , page_coords )
if text_line is None :
# intersection is empty (polygon outside of region)
log . warning ( " new text line polygon would be empty, skipping " )
continue
text_lines_new . append ( text_line )
text_region . set_TextLine ( text_lines_new )
page . set_TextRegion ( text_regions_new )
# Save metadata about this operation
metadata = pcgts . get_Metadata ( )
metadata . add_MetadataItem (
MetadataItemType ( type_ = " processingStep " ,
name = self . ocrd_tool [ ' steps ' ] [ 0 ] ,
value = TOOL ,
Labels = [ LabelsType (
externalModel = " ocrd-tool " ,
externalId = " parameters " ,
Label = [ LabelType ( type_ = name , value = self . parameter [ name ] )
for name in self . parameter . keys ( ) ] ) ] ) )
self . add_metadata ( pcgts )
self . workspace . add_file (
ID = file_id ,
file_grp = self . output_file_grp ,
pageId = page_id ,
mimetype = ' application/vnd.prima.page+xml ' ,
local_filename = os . path . join ( self . output_file_grp , file_id ) + ' .xml ' ,
content = ocrd_models . ocrd_page . to_xml ( pcgts )
ID = file_id ,
file_grp = self . output_file_grp ,
pageId = page_id ,
mimetype = ' application/vnd.prima.page+xml ' ,
local_filename = os . path . join ( self . output_file_grp , file_id ) + ' .xml ' ,
content = ocrd_models . ocrd_page . to_xml ( pcgts )
)
def adapt_coords ( segment , parent , transform ) :
points = segment . get_Coords ( ) . get_points ( )
polygon = polygon_from_points ( points )
# polygon absolute coords (after transforming back from page coords, e.g. deskewing)
polygon_new = coordinates_for_segment ( polygon , None , transform )
# intersection with parent polygon
polygon_new = polygon_for_parent ( polygon_new , parent )
if polygon_new is None :
return None
points_new = points_from_polygon ( polygon_new )
segment . set_Coords ( CoordsType ( points = points_new ) )
return segment
# from ocrd_tesserocr, to be integrated into core (somehow)...
def polygon_for_parent ( polygon , parent ) :
""" Clip polygon to parent polygon range.
( Should be moved to ocrd_utils . coordinates_for_segment . )
"""
childp = Polygon ( polygon )
if isinstance ( parent , PageType ) :
if parent . get_Border ( ) :
parentp = Polygon ( polygon_from_points ( parent . get_Border ( ) . get_Coords ( ) . points ) )
else :
parentp = Polygon ( [ [ 0 , 0 ] , [ 0 , parent . get_imageHeight ( ) ] ,
[ parent . get_imageWidth ( ) , parent . get_imageHeight ( ) ] ,
[ parent . get_imageWidth ( ) , 0 ] ] )
else :
parentp = Polygon ( polygon_from_points ( parent . get_Coords ( ) . points ) )
# check if clipping is necessary
if childp . within ( parentp ) :
return polygon
# ensure input coords have valid paths (without self-intersection)
# (this can happen when shapes valid in floating point are rounded)
childp = make_valid ( childp )
parentp = make_valid ( parentp )
# clip to parent
interp = childp . intersection ( parentp )
if interp . is_empty or interp . area == 0.0 :
# this happens if Tesseract "finds" something
# outside of the valid Border of a deskewed/cropped page
# (empty corners created by masking); will be ignored
return None
if interp . type == ' GeometryCollection ' :
# heterogeneous result: filter zero-area shapes (LineString, Point)
interp = unary_union ( [ geom for geom in interp . geoms if geom . area > 0 ] )
if interp . type == ' MultiPolygon ' :
# homogeneous result: construct convex hull to connect
# FIXME: construct concave hull / alpha shape
interp = interp . convex_hull
if interp . minimum_clearance < 1.0 :
# follow-up calculations will necessarily be integer;
# so anticipate rounding here and then ensure validity
interp = asPolygon ( np . round ( interp . exterior . coords ) )
interp = make_valid ( interp )
return interp . exterior . coords [ : - 1 ] # keep open
# from ocrd_tesserocr, to be integrated into core (somehow)...
def make_valid ( polygon ) :
for split in range ( 1 , len ( polygon . exterior . coords ) - 1 ) :
if polygon . is_valid or polygon . simplify ( polygon . area ) . is_valid :
break
# simplification may not be possible (at all) due to ordering
# in that case, try another starting point
polygon = Polygon ( polygon . exterior . coords [ - split : ] + polygon . exterior . coords [ : - split ] )
for tolerance in range ( 1 , int ( polygon . area ) ) :
if polygon . is_valid :
break
# simplification may require a larger tolerance
polygon = polygon . simplify ( tolerance )
return polygon
if __name__ == ' __main__ ' :
ocrd_sbb_textline_detector ( )