refactor shapely conversions into contour2polygon / polygon2contour, also handle heterogeneous geometries

This commit is contained in:
Robert Sachunsky 2025-08-21 12:59:03 +02:00
parent 8b5f90e243
commit 8be52fb143
2 changed files with 83 additions and 25 deletions

View file

@ -27,7 +27,6 @@ from loky import ProcessPoolExecutor
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import cv2 import cv2
import numpy as np import numpy as np
from shapely.geometry import Polygon
from scipy.signal import find_peaks from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter1d from scipy.ndimage import gaussian_filter1d
from numba import cuda from numba import cuda

View file

@ -1,7 +1,15 @@
from typing import Sequence, Union
from numbers import Number
from functools import partial from functools import partial
import itertools
import cv2 import cv2
import numpy as np import numpy as np
from shapely.geometry import Polygon from scipy.sparse.csgraph import minimum_spanning_tree
from shapely.geometry import Polygon, LineString
from shapely.geometry.polygon import orient
from shapely import set_precision
from shapely.ops import unary_union, nearest_points
from .rotate import rotate_image, rotation_image_new from .rotate import rotate_image, rotation_image_new
@ -37,29 +45,28 @@ def get_text_region_boxes_by_given_contours(contours):
return boxes, contours_new return boxes, contours_new
def filter_contours_area_of_image(image, contours, hierarchy, max_area=1.0, min_area=0.0, dilate=0):
    """Select contours by relative polygon area and hierarchy entry.

    Args:
        image: array; only ``image.shape[:2]`` is read, its product being
            the reference area.
        contours: sequence of contours where ``contour[k][0]`` is the k-th
            point (presumably the layout produced by ``cv2.findContours``).
        hierarchy: indexed as ``hierarchy[0][i][3]``; only contours for which
            this entry equals -1 are kept (presumably "has no parent" in the
            OpenCV hierarchy convention -- confirm against the caller).
        max_area: upper bound, as a fraction of the reference area.
        min_area: lower bound, as a fraction of the reference area.
        dilate: forwarded to ``contour2polygon``.

    Returns:
        List of contours rebuilt via ``polygon2contour`` from the polygons
        that passed all checks.
    """
    # the reference area does not change between contours: compute it once
    image_area = np.prod(image.shape[:2])
    smallest = min_area * image_area
    largest = max_area * image_area
    found_polygons_early = []
    for jv, contour in enumerate(contours):
        if len(contour) < 3:  # A polygon cannot have less than 3 points
            continue
        # cheap test first: avoid building a polygon that is discarded anyway
        if hierarchy[0][jv][3] != -1:
            continue
        polygon = contour2polygon(contour, dilate=dilate)
        if smallest <= polygon.area <= largest:
            found_polygons_early.append(polygon2contour(polygon))
    return found_polygons_early
def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, min_area): def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area=1.0, min_area=0.0, dilate=0):
found_polygons_early = [] found_polygons_early = []
for jv,c in enumerate(contours): for jv, contour in enumerate(contours):
if len(c) < 3: # A polygon cannot have less than 3 points if len(contour) < 3: # A polygon cannot have less than 3 points
continue continue
polygon = Polygon([point[0] for point in c]) polygon = contour2polygon(contour, dilate=dilate)
# area = cv2.contourArea(c) # area = cv2.contourArea(contour)
area = polygon.area area = polygon.area
##print(np.prod(thresh.shape[:2])) ##print(np.prod(thresh.shape[:2]))
# Check that polygon has area greater than minimal area # Check that polygon has area greater than minimal area
@ -68,9 +75,8 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, m
area <= max_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and
# hierarchy[0][jv][3]==-1 # hierarchy[0][jv][3]==-1
True): True):
# print(c[0][0][1]) # print(contour[0][0][1])
found_polygons_early.append(np.array(make_valid(polygon).exterior.coords[:-1], found_polygons_early.append(polygon2contour(polygon))
dtype=np.uint)[:, np.newaxis])
return found_polygons_early return found_polygons_early
def find_new_features_of_contours(contours_main): def find_new_features_of_contours(contours_main):
@ -325,16 +331,29 @@ def return_contours_of_interested_region_by_size(region_pre_p, pixel, min_area,
return img_ret[:, :, 0] return img_ret[:, :, 0]
def dilate_textline_contours(all_found_textline_polygons, dilate=5):
    """Enlarge every textline contour of every region.

    Args:
        all_found_textline_polygons: nested sequence -- one sequence of
            contours per region.
        dilate: buffer distance forwarded to ``contour2polygon``
            (defaults to 5, the value that used to be hard-coded).

    Returns:
        Nested list with the same region/contour structure, each contour
        converted to a polygon with the given dilation and back again.
    """
    return [[polygon2contour(contour2polygon(contour, dilate=dilate))
             for contour in region]
            for region in all_found_textline_polygons]
def dilate_textregion_contours(all_found_textline_polygons, dilate=5):
    """Enlarge every contour of a flat sequence of contours.

    Args:
        all_found_textline_polygons: flat sequence of contours (despite the
            parameter name, no per-region nesting is expected here).
        dilate: buffer distance forwarded to ``contour2polygon``
            (defaults to 5, the value that used to be hard-coded).

    Returns:
        List of contours, each converted to a polygon with the given
        dilation and back again.
    """
    return [polygon2contour(contour2polygon(contour, dilate=dilate))
            for contour in all_found_textline_polygons]
def contour2polygon(contour: Union[np.ndarray, Sequence[Sequence[Sequence[Number]]]], dilate=0):
    """Convert a contour into a single, valid shapely polygon.

    Each element of ``contour`` wraps one point (``element[0]``). If
    ``dilate`` is non-zero the polygon is buffered by that distance first.
    Whatever geometry type results is then reduced to one polygon and
    passed through ``make_valid``.
    """
    shape = Polygon([wrapped[0] for wrapped in contour])
    if dilate:
        shape = shape.buffer(dilate)
    if shape.geom_type == 'GeometryCollection':
        # mixed result: keep only the parts that actually cover an area
        # (drops LineString / Point members), then merge them
        areal_parts = [part for part in shape.geoms if part.area > 0]
        shape = unary_union(areal_parts)
    if shape.geom_type == 'MultiPolygon':
        # several disjoint polygons: connect them into one via join_polygons
        shape = join_polygons(shape.geoms)
    return make_valid(shape)
def polygon2contour(polygon: Polygon) -> np.ndarray:
    """Convert a polygon's exterior ring into a contour array.

    The closing point of the ring (a repeat of the first one) is dropped and
    an extra axis is inserted after the point axis, so an exterior with N
    distinct points yields an array of shape ``(N, 1, 2)``.

    Coordinates are truncated to integers. Negative values (which can arise
    when a polygon was buffered close to the origin) are clamped to 0 before
    the conversion to the unsigned result dtype; casting them directly would
    wrap around to huge values.

    Args:
        polygon: geometry providing ``exterior.coords``.

    Returns:
        Array of dtype ``np.uint``.
    """
    points = np.array(polygon.exterior.coords[:-1], dtype=int)
    return np.maximum(0, points).astype(np.uint)[:, np.newaxis]
def make_valid(polygon: Polygon) -> Polygon: def make_valid(polygon: Polygon) -> Polygon:
"""Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement.""" """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement."""
@ -343,7 +362,7 @@ def make_valid(polygon: Polygon) -> Polygon:
# make sure rounding does not invalidate # make sure rounding does not invalidate
if not all(map(isint, np.array(polygon.exterior.coords).flat)) and polygon.minimum_clearance < 1.0: if not all(map(isint, np.array(polygon.exterior.coords).flat)) and polygon.minimum_clearance < 1.0:
polygon = Polygon(np.round(polygon.exterior.coords)) polygon = Polygon(np.round(polygon.exterior.coords))
points = list(polygon.exterior.coords) points = list(polygon.exterior.coords[:-1])
# try by re-arranging points # try by re-arranging points
for split in range(1, len(points)): for split in range(1, len(points)):
if polygon.is_valid or polygon.simplify(polygon.area).is_valid: if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
@ -365,3 +384,43 @@ def make_valid(polygon: Polygon) -> Polygon:
polygon = polygon.buffer(tolerance) polygon = polygon.buffer(tolerance)
assert polygon.is_valid, polygon.wkt assert polygon.is_valid, polygon.wkt
return polygon return polygon
def join_polygons(polygons: Sequence[Polygon], scale=20) -> Polygon:
    """Merge several polygons into one by bridging their nearest points.

    Multi-part inputs are flattened first. The parts are then linked along a
    minimum spanning tree of their pairwise distances: for every tree edge a
    thin bridge polygon is laid between the two nearest points, and the union
    of all parts and bridges is returned (snapped to integer precision).

    Args:
        polygons: geometries to join; ``MultiPolygon`` / ``GeometryCollection``
            members are expanded into their parts.
        scale: controls the bridge thickness; the buffer distance around each
            connecting line is ``max(1, scale / 5)``.

    Returns:
        A single polygon. If there is only one part, it is returned as is
        (after orientation).

    Raises:
        AssertionError: if the union or the rounded result is not a single
            ``Polygon``.
    """
    # ensure input polygons are simply typed and all oriented equally
    polygons = [orient(part)
                for poly in polygons
                for part in (poly.geoms
                             if poly.geom_type in ['MultiPolygon', 'GeometryCollection']
                             else [poly])]
    npoly = len(polygons)
    if npoly == 1:
        return polygons[0]
    # dense graph of pairwise distances; its minimum spanning tree picks the
    # shortest set of connections that links all polygons
    dists = np.zeros((npoly, npoly), dtype=float)
    for i, j in itertools.combinations(range(npoly), 2):
        dist = polygons[i].distance(polygons[j])
        if dist < 1e-5:
            dist = 1e-5 # if pair merely touches, we still need to get an edge
        dists[i, j] = dist
        dists[j, i] = dist
    dists = minimum_spanning_tree(dists, overwrite=True)
    # add bridge polygons (where necessary)
    bridge_width = max(1, scale / 5)
    for prevp, nextp in zip(*dists.nonzero()):
        nearest = nearest_points(polygons[prevp], polygons[nextp])
        # quad_segs is the Shapely 2 name of the former `resolution` keyword
        bridgep = orient(LineString(nearest).buffer(bridge_width, quad_segs=1), -1)
        polygons.append(bridgep)
    jointp = unary_union(polygons)
    assert jointp.geom_type == 'Polygon', jointp.wkt
    # follow-up calculations will necessarily be integer;
    # so anticipate rounding here and then ensure validity
    jointp2 = set_precision(jointp, 1.0)
    if jointp2.geom_type != 'Polygon' or not jointp2.is_valid:
        # snapping broke the geometry: round the exterior by hand instead
        jointp2 = Polygon(np.round(jointp.exterior.coords))
        jointp2 = make_valid(jointp2)
    assert jointp2.geom_type == 'Polygon', jointp2.wkt
    return jointp2