From a48e52c00eef1b1e8c85b25bf4d95e46ecaf0cf1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 29 Sep 2025 13:49:18 +0200 Subject: [PATCH 001/101] :memo: extend changelog for v0.5.0 --- CHANGELOG.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ad9a09..bfdd1ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,37 @@ Fixed: * restoring the contour in the original image caused an error due to an empty tuple, #154 +Changed: + + * CLIs: read only allowed filename suffixes (image or XML) with `--dir_in` + * CLIs: make all output options required, and `-i` / `-di` required but mutually exclusive + * ocr CLI: drop redundant `-brb` in favour of just `-dib` + * APIs: move all input/output path options from class (kwarg and attribute) to `run` kwarg + * layout textlines: polygonal also without `-cl` + Added: * `eynollah machine-based-reading-order` CLI to run reading order detection, #175 * `eynollah enhancement` CLI to run image enhancement, #175 * Improved models for page extraction and reading order detection, #175 +Merged PRs: + + * better machine based reading order + layout and textline + ocr by @vahidrezanezhad in https://github.com/qurator-spk/eynollah/pull/175 + * CI: pypi by @kba in https://github.com/qurator-spk/eynollah/pull/154 + * CI: Use most recent actions/setup-python@v5 by @kba in https://github.com/qurator-spk/eynollah/pull/157 + * update docker by @bertsky in https://github.com/qurator-spk/eynollah/pull/159 + * Ocrd fixes by @kba in https://github.com/qurator-spk/eynollah/pull/167 + * Updating readme for eynollah use cases cli by @kba in https://github.com/qurator-spk/eynollah/pull/166 + * OCR-D processor: expose reading_order_machine_based by @bertsky in https://github.com/qurator-spk/eynollah/pull/171 + * prepare release v0.5.0: fix logging by @bertsky in https://github.com/qurator-spk/eynollah/pull/180 + * mb_ro_on_layout: 
remove copy-pasta code not actually used by @kba in https://github.com/qurator-spk/eynollah/pull/181 + * prepare release v0.5.0: improve CLI docstring, refactor I/O path options from class to run kwargs, increase test coverage @bertsky in #182 + * prepare release v0.5.0: fix for OCR doit subtest by @bertsky in https://github.com/qurator-spk/eynollah/pull/183 + * Prepare release v0.5.0 by @kba in https://github.com/qurator-spk/eynollah/pull/178 + * updating eynollah README, how to use it for use cases by @vahidrezanezhad in https://github.com/qurator-spk/eynollah/pull/156 + * add feedback to command line interface by @michalbubula in https://github.com/qurator-spk/eynollah/pull/170 + ## [0.4.0] - 2025-04-07 Fixed: From 09ece86f0dcb860eef978319b2350ccf7df13c2c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 Aug 2025 11:58:45 +0200 Subject: [PATCH 002/101] dilate_textregions_contours: simplify (via shapely's Polygon.buffer()), ensure validity --- src/eynollah/eynollah.py | 212 ++-------------------------------- src/eynollah/utils/contour.py | 30 ++++- 2 files changed, 36 insertions(+), 206 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index d47016b..55789ae 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -27,6 +27,7 @@ from loky import ProcessPoolExecutor import xml.etree.ElementTree as ET import cv2 import numpy as np +from shapely.geometry import Polygon from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from numba import cuda @@ -68,6 +69,7 @@ from .utils.contour import ( get_text_region_boxes_by_given_contours, get_textregion_contours_in_org_image, get_textregion_contours_in_org_image_light, + make_valid, return_contours_of_image, return_contours_of_interested_region, return_contours_of_interested_region_by_min_size, @@ -3670,211 +3672,15 @@ class Eynollah: return x_differential_new def dilate_textregions_contours_textline_version(self, all_found_textline_polygons): 
- #print(all_found_textline_polygons) - for j in range(len(all_found_textline_polygons)): - for ij in range(len(all_found_textline_polygons[j])): - con_ind = all_found_textline_polygons[j][ij] - area = cv2.contourArea(con_ind) - con_ind = con_ind.astype(float) - - x_differential = np.diff( con_ind[:,0,0]) - y_differential = np.diff( con_ind[:,0,1]) - - x_differential = gaussian_filter1d(x_differential, 0.1) - y_differential = gaussian_filter1d(y_differential, 0.1) - - x_min = float(np.min( con_ind[:,0,0] )) - y_min = float(np.min( con_ind[:,0,1] )) - - x_max = float(np.max( con_ind[:,0,0] )) - y_max = float(np.max( con_ind[:,0,1] )) - - x_differential_mask_nonzeros = [ ind/abs(ind) if ind!=0 else ind for ind in x_differential] - y_differential_mask_nonzeros = [ ind/abs(ind) if ind!=0 else ind for ind in y_differential] - - abs_diff=abs(abs(x_differential)- abs(y_differential) ) - - inc_x = np.zeros(len(x_differential)+1) - inc_y = np.zeros(len(x_differential)+1) - - if (y_max-y_min) <= (x_max-x_min): - dilation_m1 = round(area / (x_max-x_min) * 0.12) - else: - dilation_m1 = round(area / (y_max-y_min) * 0.12) - - if dilation_m1>8: - dilation_m1 = 8 - if dilation_m1<6: - dilation_m1 = 6 - #print(dilation_m1, 'dilation_m1') - dilation_m1 = 6 - dilation_m2 = int(dilation_m1/2.) 
+1 - - for i in range(len(x_differential)): - if abs_diff[i]==0: - inc_x[i+1] = dilation_m2*(-1*y_differential_mask_nonzeros[i]) - inc_y[i+1] = dilation_m2*(x_differential_mask_nonzeros[i]) - elif abs_diff[i]!=0 and x_differential_mask_nonzeros[i]==0 and y_differential_mask_nonzeros[i]!=0: - inc_x[i+1]= dilation_m1*(-1*y_differential_mask_nonzeros[i]) - elif abs_diff[i]!=0 and x_differential_mask_nonzeros[i]!=0 and y_differential_mask_nonzeros[i]==0: - inc_y[i+1] = dilation_m1*(x_differential_mask_nonzeros[i]) - - elif abs_diff[i]!=0 and abs_diff[i]>=3: - if abs(x_differential[i])>abs(y_differential[i]): - inc_y[i+1] = dilation_m1*(x_differential_mask_nonzeros[i]) - else: - inc_x[i+1]= dilation_m1*(-1*y_differential_mask_nonzeros[i]) - else: - inc_x[i+1] = dilation_m2*(-1*y_differential_mask_nonzeros[i]) - inc_y[i+1] = dilation_m2*(x_differential_mask_nonzeros[i]) - - inc_x[0] = inc_x[-1] - inc_y[0] = inc_y[-1] - - con_scaled = con_ind*1 - - con_scaled[:,0, 0] = con_ind[:,0,0] + np.array(inc_x)[:] - con_scaled[:,0, 1] = con_ind[:,0,1] + np.array(inc_y)[:] - - con_scaled[:,0, 1][con_scaled[:,0, 1]<0] = 0 - con_scaled[:,0, 0][con_scaled[:,0, 0]<0] = 0 - - area_scaled = cv2.contourArea(con_scaled.astype(np.int32)) - - con_ind = con_ind.astype(np.int32) - - results = [cv2.pointPolygonTest(con_ind, (con_scaled[ind,0, 0], con_scaled[ind,0, 1]), False) - for ind in range(len(con_scaled[:,0, 1])) ] - results = np.array(results) - #print(results,'results') - results[results==0] = 1 - - diff_result = np.diff(results) - - indices_2 = [ind for ind in range(len(diff_result)) if diff_result[ind]==2] - indices_m2 = [ind for ind in range(len(diff_result)) if diff_result[ind]==-2] - - if results[0]==1: - con_scaled[:indices_m2[0]+1,0, 1] = con_ind[:indices_m2[0]+1,0,1] - con_scaled[:indices_m2[0]+1,0, 0] = con_ind[:indices_m2[0]+1,0,0] - #indices_2 = indices_2[1:] - indices_m2 = indices_m2[1:] - - if len(indices_2)>len(indices_m2): - con_scaled[indices_2[-1]+1:,0, 1] = 
con_ind[indices_2[-1]+1:,0,1] - con_scaled[indices_2[-1]+1:,0, 0] = con_ind[indices_2[-1]+1:,0,0] - indices_2 = indices_2[:-1] - - for ii in range(len(indices_2)): - con_scaled[indices_2[ii]+1:indices_m2[ii]+1,0, 1] = con_scaled[indices_2[ii],0, 1] - con_scaled[indices_2[ii]+1:indices_m2[ii]+1,0, 0] = con_scaled[indices_2[ii],0, 0] - - all_found_textline_polygons[j][ij][:,0,1] = con_scaled[:,0, 1] - all_found_textline_polygons[j][ij][:,0,0] = con_scaled[:,0, 0] - return all_found_textline_polygons + return [[np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords, + dtype=int)[:, np.newaxis] + for poly in region] + for region in all_found_textline_polygons] def dilate_textregions_contours(self, all_found_textline_polygons): - #print(all_found_textline_polygons) - for j in range(len(all_found_textline_polygons)): - con_ind = all_found_textline_polygons[j] - #print(len(con_ind[:,0,0]),'con_ind[:,0,0]') - area = cv2.contourArea(con_ind) - con_ind = con_ind.astype(float) - - x_differential = np.diff( con_ind[:,0,0]) - y_differential = np.diff( con_ind[:,0,1]) - - x_differential = gaussian_filter1d(x_differential, 0.1) - y_differential = gaussian_filter1d(y_differential, 0.1) - - x_min = float(np.min( con_ind[:,0,0] )) - y_min = float(np.min( con_ind[:,0,1] )) - - x_max = float(np.max( con_ind[:,0,0] )) - y_max = float(np.max( con_ind[:,0,1] )) - - x_differential_mask_nonzeros = [ ind/abs(ind) if ind!=0 else ind for ind in x_differential] - y_differential_mask_nonzeros = [ ind/abs(ind) if ind!=0 else ind for ind in y_differential] - - abs_diff=abs(abs(x_differential)- abs(y_differential) ) - - inc_x = np.zeros(len(x_differential)+1) - inc_y = np.zeros(len(x_differential)+1) - - if (y_max-y_min) <= (x_max-x_min): - dilation_m1 = round(area / (x_max-x_min) * 0.12) - else: - dilation_m1 = round(area / (y_max-y_min) * 0.12) - - if dilation_m1>8: - dilation_m1 = 8 - if dilation_m1<6: - dilation_m1 = 6 - #print(dilation_m1, 'dilation_m1') - dilation_m1 = 6 - 
dilation_m2 = int(dilation_m1/2.) +1 - - for i in range(len(x_differential)): - if abs_diff[i]==0: - inc_x[i+1] = dilation_m2*(-1*y_differential_mask_nonzeros[i]) - inc_y[i+1] = dilation_m2*(x_differential_mask_nonzeros[i]) - elif abs_diff[i]!=0 and x_differential_mask_nonzeros[i]==0 and y_differential_mask_nonzeros[i]!=0: - inc_x[i+1]= dilation_m1*(-1*y_differential_mask_nonzeros[i]) - elif abs_diff[i]!=0 and x_differential_mask_nonzeros[i]!=0 and y_differential_mask_nonzeros[i]==0: - inc_y[i+1] = dilation_m1*(x_differential_mask_nonzeros[i]) - - elif abs_diff[i]!=0 and abs_diff[i]>=3: - if abs(x_differential[i])>abs(y_differential[i]): - inc_y[i+1] = dilation_m1*(x_differential_mask_nonzeros[i]) - else: - inc_x[i+1]= dilation_m1*(-1*y_differential_mask_nonzeros[i]) - else: - inc_x[i+1] = dilation_m2*(-1*y_differential_mask_nonzeros[i]) - inc_y[i+1] = dilation_m2*(x_differential_mask_nonzeros[i]) - - inc_x[0] = inc_x[-1] - inc_y[0] = inc_y[-1] - - con_scaled = con_ind*1 - - con_scaled[:,0, 0] = con_ind[:,0,0] + np.array(inc_x)[:] - con_scaled[:,0, 1] = con_ind[:,0,1] + np.array(inc_y)[:] - - con_scaled[:,0, 1][con_scaled[:,0, 1]<0] = 0 - con_scaled[:,0, 0][con_scaled[:,0, 0]<0] = 0 - - area_scaled = cv2.contourArea(con_scaled.astype(np.int32)) - - con_ind = con_ind.astype(np.int32) - - results = [cv2.pointPolygonTest(con_ind, (con_scaled[ind,0, 0], con_scaled[ind,0, 1]), False) - for ind in range(len(con_scaled[:,0, 1])) ] - results = np.array(results) - #print(results,'results') - results[results==0] = 1 - - diff_result = np.diff(results) - indices_2 = [ind for ind in range(len(diff_result)) if diff_result[ind]==2] - indices_m2 = [ind for ind in range(len(diff_result)) if diff_result[ind]==-2] - - if results[0]==1: - con_scaled[:indices_m2[0]+1,0, 1] = con_ind[:indices_m2[0]+1,0,1] - con_scaled[:indices_m2[0]+1,0, 0] = con_ind[:indices_m2[0]+1,0,0] - #indices_2 = indices_2[1:] - indices_m2 = indices_m2[1:] - - if len(indices_2)>len(indices_m2): - 
con_scaled[indices_2[-1]+1:,0, 1] = con_ind[indices_2[-1]+1:,0,1] - con_scaled[indices_2[-1]+1:,0, 0] = con_ind[indices_2[-1]+1:,0,0] - indices_2 = indices_2[:-1] - - for ii in range(len(indices_2)): - con_scaled[indices_2[ii]+1:indices_m2[ii]+1,0, 1] = con_scaled[indices_2[ii],0, 1] - con_scaled[indices_2[ii]+1:indices_m2[ii]+1,0, 0] = con_scaled[indices_2[ii],0, 0] - - all_found_textline_polygons[j][:,0,1] = con_scaled[:,0, 1] - all_found_textline_polygons[j][:,0,0] = con_scaled[:,0, 0] - return all_found_textline_polygons + return [np.array(make_valid(Polygon(poly[:, 0])).buffer(5).exterior.coords, + dtype=int)[:, np.newaxis] + for poly in all_found_textline_polygons] def dilate_textline_contours(self, all_found_textline_polygons): for j in range(len(all_found_textline_polygons)): diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 0e84153..3d7e5c8 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -1,7 +1,7 @@ from functools import partial import cv2 import numpy as np -from shapely import geometry +from shapely.geometry import Polygon from .rotate import rotate_image, rotation_image_new @@ -43,7 +43,7 @@ def filter_contours_area_of_image(image, contours, hierarchy, max_area, min_area if len(c) < 3: # A polygon cannot have less than 3 points continue - polygon = geometry.Polygon([point[0] for point in c]) + polygon = Polygon([point[0] for point in c]) area = polygon.area if (area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and @@ -58,7 +58,7 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, m if len(c) < 3: # A polygon cannot have less than 3 points continue - polygon = geometry.Polygon([point[0] for point in c]) + polygon = Polygon([point[0] for point in c]) # area = cv2.contourArea(c) area = polygon.area ##print(np.prod(thresh.shape[:2])) @@ -332,3 +332,27 @@ def return_contours_of_interested_region_by_size(region_pre_p, 
pixel, min_area, return img_ret[:, :, 0] +def make_valid(polygon: Polygon) -> Polygon: + """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement.""" + points = list(polygon.exterior.coords) + # try by re-arranging points + for split in range(1, len(points)): + if polygon.is_valid or polygon.simplify(polygon.area).is_valid: + break + # simplification may not be possible (at all) due to ordering + # in that case, try another starting point + polygon = Polygon(points[-split:]+points[:-split]) + # try by simplification + for tolerance in range(int(polygon.area + 1.5)): + if polygon.is_valid: + break + # simplification may require a larger tolerance + polygon = polygon.simplify(tolerance + 1) + # try by enlarging + for tolerance in range(1, int(polygon.area + 2.5)): + if polygon.is_valid: + break + # enlargement may require a larger tolerance + polygon = polygon.buffer(tolerance) + assert polygon.is_valid, polygon.wkt + return polygon From b48c41e68ff59d8cff97a59a534fee20d2d32408 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 Aug 2025 20:09:09 +0200 Subject: [PATCH 003/101] return_boxes_of_images_by_order_of_reading_new: simplify, avoid changing dtype during np.append --- src/eynollah/eynollah.py | 2 +- src/eynollah/utils/__init__.py | 214 +++++++++++++++------------------ 2 files changed, 97 insertions(+), 119 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 55789ae..959e9a6 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3678,7 +3678,7 @@ class Eynollah: for region in all_found_textline_polygons] def dilate_textregions_contours(self, all_found_textline_polygons): - return [np.array(make_valid(Polygon(poly[:, 0])).buffer(5).exterior.coords, + return [np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords, dtype=int)[:, np.newaxis] for poly in all_found_textline_polygons] diff --git a/src/eynollah/utils/__init__.py 
b/src/eynollah/utils/__init__.py index c5962f8..7168d95 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1632,6 +1632,7 @@ def return_boxes_of_images_by_order_of_reading_new( regions_without_separators = cv2.flip(regions_without_separators,1) boxes=[] peaks_neg_tot_tables = [] + splitter_y_new = np.array(splitter_y_new, dtype=int) for i in range(len(splitter_y_new)-1): #print(splitter_y_new[i],splitter_y_new[i+1]) matrix_new = matrix_of_lines_ch[:,:][(matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & @@ -1644,14 +1645,9 @@ def return_boxes_of_images_by_order_of_reading_new( # 0.1 * (np.abs(splitter_y_new[i+1]-splitter_y_new[i]))): if True: try: - if erosion_hurts: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:], - num_col_classifier, tables, multiplier=6.) - else: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:], - num_col_classifier, tables, multiplier=7.) + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :], + num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) except: peaks_neg_fin=[] num_col = 0 @@ -1661,7 +1657,7 @@ def return_boxes_of_images_by_order_of_reading_new( #print('burda') if len(peaks_neg_fin)==0: num_col, peaks_neg_fin = find_num_col( - regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:], + regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :], num_col_classifier, tables, multiplier=3.) 
peaks_neg_fin_early=[] peaks_neg_fin_early.append(0) @@ -1674,21 +1670,21 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_fin_rev=[] for i_n in range(len(peaks_neg_fin_early)-1): #print(i_n,'i_n') - #plt.plot(regions_without_separators[int(splitter_y_new[i]): - # int(splitter_y_new[i+1]), + #plt.plot(regions_without_separators[splitter_y_new[i]: + # splitter_y_new[i+1], # peaks_neg_fin_early[i_n]: # peaks_neg_fin_early[i_n+1]].sum(axis=0) ) #plt.show() try: num_col, peaks_neg_fin1 = find_num_col( - regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]), + regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], peaks_neg_fin_early[i_n]:peaks_neg_fin_early[i_n+1]], num_col_classifier,tables, multiplier=7.) except: peaks_neg_fin1=[] try: num_col, peaks_neg_fin2 = find_num_col( - regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]), + regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], peaks_neg_fin_early[i_n]:peaks_neg_fin_early[i_n+1]], num_col_classifier,tables, multiplier=5.) 
except: @@ -1716,7 +1712,7 @@ def return_boxes_of_images_by_order_of_reading_new( except: pass #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:], + # regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],:], # multiplier=7.0) x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] @@ -1738,31 +1734,28 @@ def return_boxes_of_images_by_order_of_reading_new( y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) - x_starting = np.array(x_starting) - x_ending = np.array(x_ending) - y_type_2 = np.array(y_type_2) - y_diff_type_2 = np.array(y_diff_type_2) + all_columns = set(range(len(peaks_neg_tot) - 1)) if ((reading_order_type==1) or (reading_order_type==0 and (len(y_lines_without_mother)>=2 or there_is_sep_with_child==1))): try: - y_grenze=int(splitter_y_new[i])+300 + y_grenze = splitter_y_new[i] + 300 #check if there is a big separator in this y_mains_sep_ohne_grenzen args_early_ys=np.arange(len(y_type_2)) #print(args_early_ys,'args_early_ys') - #print(int(splitter_y_new[i]),int(splitter_y_new[i+1])) + #print(splitter_y_new[i], splitter_y_new[i+1]) - x_starting_up = x_starting[(y_type_2 > int(splitter_y_new[i])) & + x_starting_up = x_starting[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > int(splitter_y_new[i])) & + x_ending_up = x_ending[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > int(splitter_y_new[i])) & + y_type_2_up = y_type_2[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > int(splitter_y_new[i])) & + y_diff_type_2_up = y_diff_type_2[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= 
y_grenze)] - args_up = args_early_ys[(y_type_2 > int(splitter_y_new[i])) & + args_up = args_early_ys[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] if len(y_type_2_up) > 0: y_main_separator_up = y_type_2_up [(x_starting_up==0) & @@ -1776,8 +1769,8 @@ def return_boxes_of_images_by_order_of_reading_new( args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - int(splitter_y_new[i]), int( np.max(y_diff_main_separator_up))]) - splitter_y_new[i]=[ np.max(y_diff_main_separator_up) ][0] + splitter_y_new[i], y_diff_main_separator_up.max()]) + splitter_y_new[i] = y_diff_main_separator_up.max() #print(splitter_y_new[i],'splitter_y_new[i]') y_type_2 = y_type_2[args_to_be_kept] @@ -1786,29 +1779,28 @@ def return_boxes_of_images_by_order_of_reading_new( y_diff_type_2 = y_diff_type_2[args_to_be_kept] #print('galdiha') - y_grenze=int(splitter_y_new[i])+200 + y_grenze = splitter_y_new[i] + 200 args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > int(splitter_y_new[i])) & + y_type_2_up=y_type_2[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > int(splitter_y_new[i])) & + x_starting_up=x_starting[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > int(splitter_y_new[i])) & + x_ending_up=x_ending[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > int(splitter_y_new[i])) & + y_diff_type_2_up=y_diff_type_2[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > int(splitter_y_new[i])) & + args_up2=args_early_ys2[(y_type_2 > splitter_y_new[i]) & (y_type_2 <= y_grenze)] #print(y_type_2_up,x_starting_up,x_ending_up,'didid') - nodes_in = [] + nodes_in = set() for ij in range(len(x_starting_up)): - nodes_in = nodes_in + list(range(x_starting_up[ij], - 
x_ending_up[ij])) - nodes_in = np.unique(nodes_in) + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) #print(nodes_in,'nodes_in') - if set(nodes_in)==set(range(len(peaks_neg_tot)-1)): + if nodes_in == set(range(len(peaks_neg_tot)-1)): pass - elif set(nodes_in)==set(range(1, len(peaks_neg_tot)-1)): + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): pass else: #print('burdaydikh') @@ -1823,17 +1815,16 @@ def return_boxes_of_images_by_order_of_reading_new( pass #print('burdaydikh2') elif len(y_diff_main_separator_up)==0: - nodes_in = [] + nodes_in = set() for ij in range(len(x_starting_up)): - nodes_in = nodes_in + list(range(x_starting_up[ij], - x_ending_up[ij])) - nodes_in = np.unique(nodes_in) + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) #print(nodes_in,'nodes_in2') #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - if set(nodes_in)==set(range(len(peaks_neg_tot)-1)): + if nodes_in == set(range(len(peaks_neg_tot)-1)): pass - elif set(nodes_in)==set(range(1,len(peaks_neg_tot)-1)): + elif nodes_in == set(range(1,len(peaks_neg_tot)-1)): pass else: #print('burdaydikh') @@ -1858,26 +1849,24 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_by_order=[] if (len(x_end_with_child_without_mother)==0 and reading_order_type==0) or reading_order_type==1: if reading_order_type==1: - y_lines_by_order.append(int(splitter_y_new[i])) + y_lines_by_order.append(splitter_y_new[i]) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = [] + columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_covered_by_mothers = list(set(columns_covered_by_mothers)) - - all_columns=np.arange(len(peaks_neg_tot)-1) - 
columns_not_covered=list(set(all_columns) - set(columns_covered_by_mothers)) - y_type_2 = np.append(y_type_2, [int(splitter_y_new[i])] * (len(columns_not_covered) + len(x_start_without_mother))) - ##y_lines_by_order = np.append(y_lines_by_order, [int(splitter_y_new[i])] * len(columns_not_covered)) + columns_covered_by_mothers.update( + range(x_start_without_mother[dj], + x_end_without_mother[dj])) + columns_not_covered = list(all_columns - columns_covered_by_mothers) + y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), + dtype=int) * splitter_y_new[i]) + ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, columns_not_covered) + x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) + x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) ind_args=np.arange(len(y_type_2)) @@ -1906,39 +1895,34 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_by_order.append(x_end_column_sort[ii]-1) else: #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = [] + columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_covered_by_mothers = list(set(columns_covered_by_mothers)) - - all_columns=np.arange(len(peaks_neg_tot)-1) - columns_not_covered=list(set(all_columns) - set(columns_covered_by_mothers)) - y_type_2 = np.append(y_type_2, [int(splitter_y_new[i])] * (len(columns_not_covered) + len(x_start_without_mother))) - ##y_lines_by_order = 
np.append(y_lines_by_order, [int(splitter_y_new[i])] * len(columns_not_covered)) + columns_covered_by_mothers.update( + range(x_start_without_mother[dj], + x_end_without_mother[dj])) + columns_not_covered = list(all_columns - columns_covered_by_mothers) + y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), + dtype=int) * splitter_y_new[i]) + ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, columns_not_covered) + x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) + x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_with_child_no_mothers = [] + columns_covered_by_with_child_no_mothers = set() for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_with_child_no_mothers = columns_covered_by_with_child_no_mothers + \ - list(range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - columns_covered_by_with_child_no_mothers = list(set(columns_covered_by_with_child_no_mothers)) - - all_columns = np.arange(len(peaks_neg_tot)-1) - columns_not_covered_child_no_mother = list(set(all_columns) - set(columns_covered_by_with_child_no_mothers)) + columns_covered_by_with_child_no_mothers.update( + range(x_start_with_child_without_mother[dj], + x_end_with_child_without_mother[dj])) + columns_not_covered_child_no_mother = list(all_columns - columns_covered_by_with_child_no_mothers) #indexes_to_be_spanned=[] for i_s in range(len(x_end_with_child_without_mother)): columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s]) columns_not_covered_child_no_mother = 
np.sort(columns_not_covered_child_no_mother) ind_args = np.arange(len(y_type_2)) - x_end_with_child_without_mother = np.array(x_end_with_child_without_mother) - x_start_with_child_without_mother = np.array(x_start_with_child_without_mother) + x_end_with_child_without_mother = np.array(x_end_with_child_without_mother, int) + x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) for i_s_nc in columns_not_covered_child_no_mother: if i_s_nc in x_start_with_child_without_mother: x_end_biggest_column = x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] @@ -1951,7 +1935,7 @@ def return_boxes_of_images_by_order_of_reading_new( for i_c in range(len(y_column_nc)): if i_c==(len(y_column_nc)-1): ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & (x_ending<=x_end_biggest_column)] else: @@ -1967,21 +1951,19 @@ def return_boxes_of_images_by_order_of_reading_new( if len(x_diff_all_between_nm_wc)>0: biggest=np.argmax(x_diff_all_between_nm_wc) - columns_covered_by_mothers = [] + columns_covered_by_mothers = set() for dj in range(len(x_starting_all_between_nm_wc)): - columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - columns_covered_by_mothers = list(set(columns_covered_by_mothers)) - - all_columns=np.arange(i_s_nc, x_end_biggest_column) - columns_not_covered = list(set(all_columns) - set(columns_covered_by_mothers)) + columns_covered_by_mothers.update( + range(x_starting_all_between_nm_wc[dj], + x_ending_all_between_nm_wc[dj])) + child_columns = set(range(i_s_nc, x_end_biggest_column)) + columns_not_covered = list(child_columns - columns_covered_by_mothers) should_longest_line_be_extended=0 if (len(x_diff_all_between_nm_wc) > 0 and set(list(range(x_starting_all_between_nm_wc[biggest], x_ending_all_between_nm_wc[biggest])) + - list(columns_not_covered)) != set(all_columns)): + 
list(columns_not_covered)) != child_columns): should_longest_line_be_extended=1 index_lines_so_close_to_top_separator = \ np.arange(len(y_all_between_nm_wc))[(y_all_between_nm_wc>y_column_nc[i_c]) & @@ -2008,8 +1990,8 @@ def return_boxes_of_images_by_order_of_reading_new( pass y_all_between_nm_wc = np.append(y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, columns_not_covered) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered) + 1) + x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) for column in range(i_s_nc, x_end_biggest_column): @@ -2078,7 +2060,7 @@ def return_boxes_of_images_by_order_of_reading_new( if len(y_in_cols)>0: y_down=np.min(y_in_cols) else: - y_down=[int(splitter_y_new[i+1])][0] + y_down=splitter_y_new[i+1] #print(y_itself,'y_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], @@ -2086,45 +2068,42 @@ def return_boxes_of_images_by_order_of_reading_new( y_down]) except: boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - int(splitter_y_new[i]), int(splitter_y_new[i+1])]) + splitter_y_new[i], splitter_y_new[i+1]]) else: y_lines_by_order=[] x_start_by_order=[] x_end_by_order=[] if len(x_starting)>0: - all_columns = np.arange(len(peaks_neg_tot)-1) - columns_covered_by_lines_covered_more_than_2col = [] + columns_covered_by_lines_covered_more_than_2col = set() for dj in range(len(x_starting)): - if set(list(range(x_starting[dj],x_ending[dj]))) == set(all_columns): - pass - else: - columns_covered_by_lines_covered_more_than_2col = columns_covered_by_lines_covered_more_than_2col + \ - list(range(x_starting[dj],x_ending[dj])) - columns_covered_by_lines_covered_more_than_2col = 
list(set(columns_covered_by_lines_covered_more_than_2col)) - columns_not_covered = list(set(all_columns) - set(columns_covered_by_lines_covered_more_than_2col)) + if set(range(x_starting[dj], x_ending[dj])) != all_columns: + columns_covered_by_lines_covered_more_than_2col.update( + range(x_starting[dj], x_ending[dj])) + columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) - y_type_2 = np.append(y_type_2, [int(splitter_y_new[i])] * (len(columns_not_covered) + 1)) - ##y_lines_by_order = np.append(y_lines_by_order, [int(splitter_y_new[i])] * len(columns_not_covered)) + y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, + dtype=int) * splitter_y_new[i]) + ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, columns_not_covered) - x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) if len(new_main_sep_y) > 0: x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot)-1) + x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) else: x_starting = np.append(x_starting, x_starting[0]) x_ending = np.append(x_ending, x_ending[0]) else: - all_columns = np.arange(len(peaks_neg_tot)-1) - columns_not_covered = list(set(all_columns)) - y_type_2 = np.append(y_type_2, [int(splitter_y_new[i])] * len(columns_not_covered)) - ##y_lines_by_order = np.append(y_lines_by_order, [int(splitter_y_new[i])] * len(columns_not_covered)) + columns_not_covered = list(all_columns) + y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), + dtype=int) * splitter_y_new[i]) + ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) 
##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, columns_not_covered) - x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - ind_args=np.array(range(len(y_type_2))) - #ind_args=np.array(ind_args) + ind_args = np.arange(len(y_type_2)) + for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] @@ -2155,7 +2134,6 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_itself=x_start_copy.pop(il) x_end_itself=x_end_copy.pop(il) - #print(y_copy,'y_copy2') for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') y_in_cols=[] @@ -2170,7 +2148,7 @@ def return_boxes_of_images_by_order_of_reading_new( if len(y_in_cols)>0: y_down=np.min(y_in_cols) else: - y_down=[int(splitter_y_new[i+1])][0] + y_down=splitter_y_new[i+1] #print(y_itself,'y_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], From 66b2bce8b9f420895b8c47ebf46faf1ca3bbdd03 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 19 Sep 2025 12:19:58 +0200 Subject: [PATCH 004/101] return_boxes_of_images_by_order_of_reading_new: log any exceptions --- src/eynollah/eynollah.py | 6 ++++-- src/eynollah/utils/__init__.py | 22 ++++++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 959e9a6..8080035 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4553,11 +4553,13 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, matrix_of_lines_ch, - num_col_classifier, erosion_hurts, self.tables, self.right2left) + num_col_classifier, 
erosion_hurts, self.tables, self.right2left, + logger=self.logger) else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, - num_col_classifier, erosion_hurts, self.tables, self.right2left) + num_col_classifier, erosion_hurts, self.tables, self.right2left, + logger=self.logger) if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7168d95..3c130d7 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1,3 +1,5 @@ +from typing import Tuple +from logging import getLogger import time import math @@ -1626,10 +1628,16 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, def return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, matrix_of_lines_ch, - num_col_classifier, erosion_hurts, tables, right2left_readingorder): + num_col_classifier, erosion_hurts, tables, + right2left_readingorder, + logger=None): if right2left_readingorder: regions_without_separators = cv2.flip(regions_without_separators,1) + if logger is None: + logger = getLogger(__package__) + logger.debug('enter return_boxes_of_images_by_order_of_reading_new') + boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) @@ -1710,7 +1718,7 @@ def return_boxes_of_images_by_order_of_reading_new( #print(peaks_neg_fin,'peaks_neg_fin') except: - pass + logger.exception("cannot find peaks consistent with columns") #num_col, peaks_neg_fin = find_num_col( # regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],:], # multiplier=7.0) @@ -1987,7 +1995,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) x_ending_all_between_nm_wc = 
np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) except: - pass + logger.exception("cannot append") y_all_between_nm_wc = np.append(y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) @@ -2067,6 +2075,7 @@ def return_boxes_of_images_by_order_of_reading_new( y_itself, y_down]) except: + logger.exception("cannot assign boxes") boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], splitter_y_new[i], splitter_y_new[i+1]]) else: @@ -2170,6 +2179,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_new = regions_without_separators.shape[1] - boxes[i][0] boxes[i][0] = x_start_new boxes[i][1] = x_end_new - return boxes, peaks_neg_tot_tables_new - else: - return boxes, peaks_neg_tot_tables + peaks_neg_tot_tables = peaks_neg_tot_tables_new + + logger.debug('exit return_boxes_of_images_by_order_of_reading_new') + return boxes, peaks_neg_tot_tables From afba70c920b4f1dc80bd70511a07df82439e6db3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 Aug 2025 22:56:36 +0200 Subject: [PATCH 005/101] separate_lines/do_work_of_slopes: skip if crop is empty --- src/eynollah/utils/separate_lines.py | 46 +++++++++++++++------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 0322579..ffbfff7 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1345,24 +1345,26 @@ def textline_contours_postprocessing(textline_mask, slope, contour_text_interest return contours_rotated_clean -def separate_lines_new2(img_path, thetha, num_col, slope_region, logger=None, plotter=None): +def separate_lines_new2(img_crop, thetha, num_col, slope_region, logger=None, plotter=None): if logger is None: logger = getLogger(__package__) + if not np.prod(img_crop.shape): + return img_crop if num_col == 1: - 
num_patches = int(img_path.shape[1] / 200.0) + num_patches = int(img_crop.shape[1] / 200.0) else: - num_patches = int(img_path.shape[1] / 140.0) - # num_patches=int(img_path.shape[1]/200.) + num_patches = int(img_crop.shape[1] / 140.0) + # num_patches=int(img_crop.shape[1]/200.) if num_patches == 0: num_patches = 1 - img_patch_ineterst = img_path[:, :] # [peaks_neg_true[14]-dis_up:peaks_neg_true[15]+dis_down ,:] + img_patch_interest = img_crop[:, :] # [peaks_neg_true[14]-dis_up:peaks_neg_true[15]+dis_down ,:] - # plt.imshow(img_patch_ineterst) + # plt.imshow(img_patch_interest) # plt.show() - length_x = int(img_path.shape[1] / float(num_patches)) + length_x = int(img_crop.shape[1] / float(num_patches)) # margin = int(0.04 * length_x) just recently this was changed because it break lines into 2 margin = int(0.04 * length_x) # if margin<=4: @@ -1370,7 +1372,7 @@ def separate_lines_new2(img_path, thetha, num_col, slope_region, logger=None, pl # margin=0 width_mid = length_x - 2 * margin - nxf = img_path.shape[1] / float(width_mid) + nxf = img_crop.shape[1] / float(width_mid) if nxf > int(nxf): nxf = int(nxf) + 1 @@ -1386,12 +1388,12 @@ def separate_lines_new2(img_path, thetha, num_col, slope_region, logger=None, pl index_x_d = i * width_mid index_x_u = index_x_d + length_x - if index_x_u > img_path.shape[1]: - index_x_u = img_path.shape[1] - index_x_d = img_path.shape[1] - length_x + if index_x_u > img_crop.shape[1]: + index_x_u = img_crop.shape[1] + index_x_d = img_crop.shape[1] - length_x # img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] - img_xline = img_patch_ineterst[:, index_x_d:index_x_u] + img_xline = img_patch_interest[:, index_x_d:index_x_u] try: assert img_xline.any() @@ -1407,9 +1409,9 @@ def separate_lines_new2(img_path, thetha, num_col, slope_region, logger=None, pl img_line_rotated = rotate_image(img_xline, slope_xline) img_line_rotated[:, :][img_line_rotated[:, :] != 0] = 1 - img_patch_ineterst = img_path[:, :] # 
[peaks_neg_true[14]-dis_up:peaks_neg_true[14]+dis_down ,:] + img_patch_interest = img_crop[:, :] # [peaks_neg_true[14]-dis_up:peaks_neg_true[14]+dis_down ,:] - img_patch_ineterst_revised = np.zeros(img_patch_ineterst.shape) + img_patch_interest_revised = np.zeros(img_patch_interest.shape) for i in range(nxf): if i == 0: @@ -1419,11 +1421,11 @@ def separate_lines_new2(img_path, thetha, num_col, slope_region, logger=None, pl index_x_d = i * width_mid index_x_u = index_x_d + length_x - if index_x_u > img_path.shape[1]: - index_x_u = img_path.shape[1] - index_x_d = img_path.shape[1] - length_x + if index_x_u > img_crop.shape[1]: + index_x_u = img_crop.shape[1] + index_x_d = img_crop.shape[1] - length_x - img_xline = img_patch_ineterst[:, index_x_d:index_x_u] + img_xline = img_patch_interest[:, index_x_d:index_x_u] img_int = np.zeros((img_xline.shape[0], img_xline.shape[1])) img_int[:, :] = img_xline[:, :] # img_patch_org[:,:,0] @@ -1446,9 +1448,9 @@ def separate_lines_new2(img_path, thetha, num_col, slope_region, logger=None, pl int(img_int.shape[1] * (1.0)) : int(img_int.shape[1] * (1.0)) + img_int.shape[1]] img_patch_separated_returned_true_size = img_patch_separated_returned_true_size[:, margin : length_x - margin] - img_patch_ineterst_revised[:, index_x_d + margin : index_x_u - margin] = img_patch_separated_returned_true_size + img_patch_interest_revised[:, index_x_d + margin : index_x_u - margin] = img_patch_separated_returned_true_size - return img_patch_ineterst_revised + return img_patch_interest_revised def do_image_rotation(angle, img, sigma_des, logger=None): if logger is None: @@ -1546,7 +1548,7 @@ def do_work_of_slopes_new( img_int_p = all_text_region_raw[:,:] img_int_p = cv2.erode(img_int_p, KERNEL, iterations=2) - if img_int_p.shape[0] /img_int_p.shape[1] < 0.1: + if not np.prod(img_int_p.shape) or img_int_p.shape[0] /img_int_p.shape[1] < 0.1: slope = 0 slope_for_all = slope_deskew all_text_region_raw = textline_mask_tot_ea[y: y + h, x: x + w] @@ -1603,7 
+1605,7 @@ def do_work_of_slopes_new_curved( # plt.imshow(img_int_p) # plt.show() - if img_int_p.shape[0] / img_int_p.shape[1] < 0.1: + if not np.prod(img_int_p.shape) or img_int_p.shape[0] / img_int_p.shape[1] < 0.1: slope = 0 slope_for_all = slope_deskew else: From 41cc38c51aaa74fb27854a101e9fbe727478f86b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 20 Aug 2025 14:28:14 +0200 Subject: [PATCH 006/101] get_textregion_contours_in_org_image_light: no back rotation, drop slope_first (always 0) --- src/eynollah/eynollah.py | 14 ++++++-------- src/eynollah/utils/contour.py | 26 +++++++++++--------------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 8080035..49f6b33 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2927,12 +2927,10 @@ class Eynollah: #print(textline_mask_tot_ea.shape, 'textline_mask_tot_ea deskew') slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, map=self.executor.map, logger=self.logger, plotter=self.plotter) - slope_first = 0 - if self.plotter: self.plotter.save_deskewed_image(slope_deskew) self.logger.info("slope_deskew: %.2f°", slope_deskew) - return slope_deskew, slope_first + return slope_deskew def run_marginals( self, image_page, textline_mask_tot_ea, mask_images, mask_lines, @@ -4173,9 +4171,9 @@ class Eynollah: textline_mask_tot_ea_deskew = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) - slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea_deskew) + slope_deskew = self.run_deskew(textline_mask_tot_ea_deskew) else: - slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) + slope_deskew = self.run_deskew(textline_mask_tot_ea) #print("text region early -2,5 in %.1fs", time.time() - t0) #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, 
mask_lines, \ @@ -4216,7 +4214,7 @@ class Eynollah: textline_mask_tot_ea = self.run_textline(image_page) self.logger.info("textline detection took %.1fs", time.time() - t1) t1 = time.time() - slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) + slope_deskew = self.run_deskew(textline_mask_tot_ea) self.logger.info("deskewing took %.1fs", time.time() - t1) elif num_col_classifier in (1,2): org_h_l_m = textline_mask_tot_ea.shape[0] @@ -4405,12 +4403,12 @@ class Eynollah: contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, marginal_cnts=polygons_of_marginals) #print("text region early 3.5 in %.1fs", time.time() - t0) txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( - contours_only_text_parent, self.image, slope_first, confidence_matrix, map=self.executor.map) + contours_only_text_parent, self.image, confidence_matrix) #txt_con_org = self.dilate_textregions_contours(txt_con_org) #contours_only_text_parent = self.dilate_textregions_contours(contours_only_text_parent) else: txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( - contours_only_text_parent, self.image, slope_first, confidence_matrix, map=self.executor.map) + contours_only_text_parent, self.image, confidence_matrix) #print("text region early 4 in %.1fs", time.time() - t0) boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 3d7e5c8..249748a 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -247,23 +247,19 @@ def do_back_rotation_and_get_cnt_back(contour_par, index_r_con, img, slope_first cont_int[0][:, 0, 1] = cont_int[0][:, 0, 1] + np.abs(img_copy.shape[0] - img.shape[0]) return cont_int[0], index_r_con, confidence_contour -def get_textregion_contours_in_org_image_light(cnts, 
img, slope_first, confidence_matrix, map=map): +def get_textregion_contours_in_org_image_light(cnts, img, confidence_matrix): if not len(cnts): return [], [] - - confidence_matrix = cv2.resize(confidence_matrix, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST) - img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST) - ##cnts = list( (np.array(cnts)/2).astype(np.int16) ) - #cnts = cnts/2 - cnts = [(i/6).astype(int) for i in cnts] - results = map(partial(do_back_rotation_and_get_cnt_back, - img=img, - slope_first=slope_first, - confidence_matrix=confidence_matrix, - ), - cnts, range(len(cnts))) - contours, indexes, conf_contours = tuple(zip(*results)) - return [i*6 for i in contours], list(conf_contours) + + confidence_matrix = cv2.resize(confidence_matrix, + (img.shape[1] // 6, img.shape[0] // 6), + interpolation=cv2.INTER_NEAREST) + confs = [] + for cnt in cnts: + cnt_mask = np.zeros(confidence_matrix.shape) + cnt_mask = cv2.fillPoly(cnt_mask, pts=[cnt // 6], color=1.0) + confs.append(np.sum(confidence_matrix * cnt_mask) / np.sum(cnt_mask)) + return cnts, confs def return_contours_of_interested_textline(region_pre_p, pixel): # pixels of images are identified by 5 From 7b51fd662497ecd7c35b09764df2ed5c6b651a76 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 Aug 2025 01:03:46 +0200 Subject: [PATCH 007/101] avoid creating invalid polygons via rounding --- src/eynollah/eynollah.py | 5 +++-- src/eynollah/utils/contour.py | 9 +++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 49f6b33..0f458b4 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3670,16 +3670,17 @@ class Eynollah: return x_differential_new def dilate_textregions_contours_textline_version(self, all_found_textline_polygons): - return [[np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords, + return 
[[np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], dtype=int)[:, np.newaxis] for poly in region] for region in all_found_textline_polygons] def dilate_textregions_contours(self, all_found_textline_polygons): - return [np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords, + return [np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], dtype=int)[:, np.newaxis] for poly in all_found_textline_polygons] + def dilate_textline_contours(self, all_found_textline_polygons): for j in range(len(all_found_textline_polygons)): for ij in range(len(all_found_textline_polygons[j])): diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 249748a..8205c2b 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -49,7 +49,7 @@ def filter_contours_area_of_image(image, contours, hierarchy, max_area, min_area area <= max_area * np.prod(image.shape[:2]) and hierarchy[0][jv][3] == -1): found_polygons_early.append(np.array([[point] - for point in polygon.exterior.coords], dtype=np.uint)) + for point in polygon.exterior.coords[:-1]], dtype=np.uint)) return found_polygons_early def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, min_area): @@ -70,7 +70,7 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, m True): # print(c[0][0][1]) found_polygons_early.append(np.array([[point] - for point in polygon.exterior.coords], dtype=np.int32)) + for point in polygon.exterior.coords[:-1]], dtype=np.int32)) return found_polygons_early def find_new_features_of_contours(contours_main): @@ -330,6 +330,11 @@ def return_contours_of_interested_region_by_size(region_pre_p, pixel, min_area, def make_valid(polygon: Polygon) -> Polygon: """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement.""" + def isint(x): + return isinstance(x, int) or int(x) == x + # make sure rounding does not invalidate + if 
not all(map(isint, np.array(polygon.exterior.coords).flat)) and polygon.minimum_clearance < 1.0: + polygon = Polygon(np.round(polygon.exterior.coords)) points = list(polygon.exterior.coords) # try by re-arranging points for split in range(1, len(points)): From e730725da3d40cfbd20f857c36843190713725ca Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 Aug 2025 01:05:15 +0200 Subject: [PATCH 008/101] check_any_text_region_in_model_one_is_main_or_header_light: return original instead of resampled contours --- src/eynollah/utils/__init__.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 3c130d7..c479744 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -957,11 +957,11 @@ def check_any_text_region_in_model_one_is_main_or_header_light( regions_model_full = cv2.resize(regions_model_full, (regions_model_full.shape[1] // zoom, regions_model_full.shape[0] // zoom), interpolation=cv2.INTER_NEAREST) - contours_only_text_parent = [(i / zoom).astype(int) for i in contours_only_text_parent] + contours_only_text_parent_z = [(cnt / zoom).astype(int) for cnt in contours_only_text_parent] ### cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin = \ - find_new_features_of_contours(contours_only_text_parent) + find_new_features_of_contours(contours_only_text_parent_z) length_con=x_max_main-x_min_main height_con=y_max_main-y_min_main @@ -984,8 +984,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( contours_only_text_parent_main_d=[] contours_only_text_parent_head_d=[] - for ii in range(len(contours_only_text_parent)): - con=contours_only_text_parent[ii] + for ii, con in enumerate(contours_only_text_parent_z): img=np.zeros((regions_model_1.shape[0], regions_model_1.shape[1], 3)) img = cv2.fillPoly(img, pts=[con], color=(255, 255, 255)) @@ -996,23 +995,22 @@ def 
check_any_text_region_in_model_one_is_main_or_header_light( if (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2 - contours_only_text_parent_head.append(con) + contours_only_text_parent_head.append(contours_only_text_parent[ii]) + conf_contours_head.append(None) # why not conf_contours[ii], too? if contours_only_text_parent_d_ordered is not None: contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_head.append(all_box_coord[ii]) slopes_head.append(slopes[ii]) all_found_textline_polygons_head.append(all_found_textline_polygons[ii]) - conf_contours_head.append(None) else: regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1 - contours_only_text_parent_main.append(con) + contours_only_text_parent_main.append(contours_only_text_parent[ii]) conf_contours_main.append(conf_contours[ii]) if contours_only_text_parent_d_ordered is not None: contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_main.append(all_box_coord[ii]) slopes_main.append(slopes[ii]) all_found_textline_polygons_main.append(all_found_textline_polygons[ii]) - #print(all_pixels,pixels_main,pixels_header) ### to make it faster @@ -1020,8 +1018,6 @@ def check_any_text_region_in_model_one_is_main_or_header_light( # regions_model_full = cv2.resize(img, (regions_model_full.shape[1] // zoom, # regions_model_full.shape[0] // zoom), # interpolation=cv2.INTER_NEAREST) - contours_only_text_parent_head = [(i * zoom).astype(int) for i in contours_only_text_parent_head] - contours_only_text_parent_main = [(i * zoom).astype(int) for i in contours_only_text_parent_main] ### return (regions_model_1, From 17bcf1af71802d790f7508d52221d64ea4fff939 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 Aug 2025 01:32:32 +0200 Subject: [PATCH 009/101] =?UTF-8?q?rename=20*lines=5Fxml=20=E2=86=92=20*se?= 
=?UTF-8?q?plines=20for=20clarity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/eynollah.py | 58 ++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0f458b4..c04c481 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1713,9 +1713,9 @@ class Eynollah: mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_lines_xml, hir_lines_xml = return_contours_of_image(mask_lines_only) - polygons_lines_xml = textline_con_fil = filter_contours_area_of_image( - mask_lines_only, polygons_lines_xml, hir_lines_xml, max_area=1, min_area=0.00001) + polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines = textline_con_fil = filter_contours_area_of_image( + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) @@ -1779,7 +1779,7 @@ class Eynollah: [page_coord_img[2], page_coord_img[1]]])) self.logger.debug("exit get_regions_extract_images_only") - return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page + return text_regions_p_true, erosion_hurts, polygons_seplines, polygons_of_images_fin, image_page, page_coord, cont_page def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier, skip_layout_and_reading_order=False): self.logger.debug("enter get_regions_light_v") @@ -1895,24 +1895,24 @@ class Eynollah: mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_lines_xml, hir_lines_xml = 
return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) test_khat = np.zeros(prediction_regions_org.shape) - test_khat = cv2.fillPoly(test_khat, pts=polygons_lines_xml, color=(1,1,1)) + test_khat = cv2.fillPoly(test_khat, pts=polygons_seplines, color=(1,1,1)) #plt.imshow(test_khat[:,:]) #plt.show() #for jv in range(1): - #print(jv, hir_lines_xml[0][232][3]) + #print(jv, hir_seplines[0][232][3]) #test_khat = np.zeros(prediction_regions_org.shape) - #test_khat = cv2.fillPoly(test_khat, pts = [polygons_lines_xml[232]], color=(1,1,1)) + #test_khat = cv2.fillPoly(test_khat, pts = [polygons_seplines[232]], color=(1,1,1)) #plt.imshow(test_khat[:,:]) #plt.show() - polygons_lines_xml = filter_contours_area_of_image( - mask_lines_only, polygons_lines_xml, hir_lines_xml, max_area=1, min_area=0.00001) + polygons_seplines = filter_contours_area_of_image( + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001) test_khat = np.zeros(prediction_regions_org.shape) - test_khat = cv2.fillPoly(test_khat, pts = polygons_lines_xml, color=(1,1,1)) + test_khat = cv2.fillPoly(test_khat, pts = polygons_seplines, color=(1,1,1)) #plt.imshow(test_khat[:,:]) #plt.show() @@ -1937,7 +1937,7 @@ class Eynollah: #plt.show() #print("inside 4 ", time.time()-t_in) self.logger.debug("exit get_regions_light_v") - return text_regions_p_true, erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin, confidence_matrix + return text_regions_p_true, erosion_hurts, polygons_seplines, textline_mask_tot_ea, img_bin, confidence_matrix else: img_bin = resize_image(img_bin,img_height_h, img_width_h ) self.logger.debug("exit get_regions_light_v") @@ -2020,9 +2020,9 @@ class Eynollah: mask_texts_only=(prediction_regions_org[:,:]==1)*1 mask_images_only=(prediction_regions_org[:,:]==2)*1 - polygons_lines_xml, hir_lines_xml = return_contours_of_image(mask_lines_only) - polygons_lines_xml = filter_contours_area_of_image( - 
mask_lines_only, polygons_lines_xml, hir_lines_xml, max_area=1, min_area=0.00001) + polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines = filter_contours_area_of_image( + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001) polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, 1, 0.00001) @@ -2034,7 +2034,7 @@ class Eynollah: text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) self.logger.debug("exit get_regions_from_xy_2models") - return text_regions_p_true, erosion_hurts, polygons_lines_xml + return text_regions_p_true, erosion_hurts, polygons_seplines except: if self.input_binary: prediction_bin = np.copy(img_org) @@ -2069,9 +2069,9 @@ class Eynollah: mask_texts_only = (prediction_regions_org == 1)*1 mask_images_only= (prediction_regions_org == 2)*1 - polygons_lines_xml, hir_lines_xml = return_contours_of_image(mask_lines_only) - polygons_lines_xml = filter_contours_area_of_image( - mask_lines_only, polygons_lines_xml, hir_lines_xml, max_area=1, min_area=0.00001) + polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines = filter_contours_area_of_image( + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) @@ -2084,7 +2084,7 @@ class Eynollah: erosion_hurts = True self.logger.debug("exit get_regions_from_xy_2models") - return text_regions_p_true, erosion_hurts, polygons_lines_xml + return text_regions_p_true, erosion_hurts, polygons_seplines def do_order_of_regions_full_layout( self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): @@ -4102,7 +4102,7 @@ class 
Eynollah: img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) self.logger.info("Enhancing took %.1fs ", time.time() - t0) if self.extract_only_images: - text_regions_p_1, erosion_hurts, polygons_lines_xml, polygons_of_images, image_page, page_coord, cont_page = \ + text_regions_p_1, erosion_hurts, polygons_seplines, polygons_of_images, image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( @@ -4145,7 +4145,7 @@ class Eynollah: polygons_of_marginals = [] all_found_textline_polygons_marginals = [] all_box_coord_marginals = [] - polygons_lines_xml = [] + polygons_seplines = [] contours_tables = [] ocr_all_textlines = None conf_contours_textregions =None @@ -4153,13 +4153,13 @@ class Eynollah: cont_page, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions) + cont_page, polygons_seplines, contours_tables, ocr_all_textlines, conf_contours_textregions) return pcgts #print("text region early -1 in %.1fs", time.time() - t0) t1 = time.time() if self.light_version: - text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light, confidence_matrix = \ + text_regions_p_1, erosion_hurts, polygons_seplines, textline_mask_tot_ea, img_bin_light, confidence_matrix = \ self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) #print("text region early -2 in %.1fs", time.time() - t0) @@ -4186,7 +4186,7 @@ class Eynollah: textline_mask_tot_ea_org = np.copy(textline_mask_tot_ea) #print("text region early -4 in %.1fs", time.time() - t0) else: - text_regions_p_1 ,erosion_hurts, 
polygons_lines_xml = \ + text_regions_p_1, erosion_hurts, polygons_seplines = \ self.get_regions_from_xy_2models(img_res, is_image_enhanced, num_col_classifier) self.logger.info("Textregion detection took %.1fs ", time.time() - t1) @@ -4385,13 +4385,13 @@ class Eynollah: [], [], page_coord, [], [], [], [], [], [], polygons_of_images, contours_tables, [], polygons_of_marginals, empty_marginals, empty_marginals, [], [], [], - cont_page, polygons_lines_xml, [], [], []) + cont_page, polygons_seplines, [], [], []) else: pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], polygons_of_images, polygons_of_marginals, empty_marginals, empty_marginals, [], [], - cont_page, polygons_lines_xml, contours_tables, [], []) + cont_page, polygons_seplines, contours_tables, [], []) return pcgts @@ -4586,7 +4586,7 @@ class Eynollah: all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, - cont_page, polygons_lines_xml, ocr_all_textlines, conf_contours_textregions, conf_contours_textregions_h) + cont_page, polygons_seplines, ocr_all_textlines, conf_contours_textregions, conf_contours_textregions_h) return pcgts contours_only_text_parent_h = None @@ -4665,7 +4665,7 @@ class Eynollah: txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions) + cont_page, polygons_seplines, contours_tables, ocr_all_textlines, conf_contours_textregions) return pcgts From a433c736281dcf86630f80bfa686064814b313d9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 Aug 2025 01:33:16 
+0200 Subject: [PATCH 010/101] filter_contours_area_of_image*: also ensure validity here --- src/eynollah/eynollah.py | 4 ++-- src/eynollah/utils/contour.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index c04c481..7b3b81a 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3671,13 +3671,13 @@ class Eynollah: def dilate_textregions_contours_textline_version(self, all_found_textline_polygons): return [[np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], - dtype=int)[:, np.newaxis] + dtype=np.uint)[:, np.newaxis] for poly in region] for region in all_found_textline_polygons] def dilate_textregions_contours(self, all_found_textline_polygons): return [np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], - dtype=int)[:, np.newaxis] + dtype=np.uint)[:, np.newaxis] for poly in all_found_textline_polygons] diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 8205c2b..03d45b7 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -48,8 +48,8 @@ def filter_contours_area_of_image(image, contours, hierarchy, max_area, min_area if (area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and hierarchy[0][jv][3] == -1): - found_polygons_early.append(np.array([[point] - for point in polygon.exterior.coords[:-1]], dtype=np.uint)) + found_polygons_early.append(np.array(make_valid(polygon).exterior.coords[:-1], + dtype=np.uint)[:, np.newaxis]) return found_polygons_early def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, min_area): @@ -69,8 +69,8 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, m # hierarchy[0][jv][3]==-1 True): # print(c[0][0][1]) - found_polygons_early.append(np.array([[point] - for point in polygon.exterior.coords[:-1]], dtype=np.int32)) + 
found_polygons_early.append(np.array(make_valid(polygon).exterior.coords[:-1], + dtype=np.uint)[:, np.newaxis]) return found_polygons_early def find_new_features_of_contours(contours_main): From 0650274ffad576acde6048822b5f74b6303ef689 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 Aug 2025 01:42:46 +0200 Subject: [PATCH 011/101] =?UTF-8?q?move=20dilate=5F*=5Fcontours=20to=20.ut?= =?UTF-8?q?ils.contour,=20rename=20dilate=5Ftextregions=5Fcontours=5Ftextl?= =?UTF-8?q?ine=5Fversion=20=E2=86=92=20dilate=5Ftextline=5Fcontours?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/eynollah.py | 253 ++-------------------------------- src/eynollah/utils/contour.py | 11 ++ 2 files changed, 22 insertions(+), 242 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 7b3b81a..fe233cb 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -69,12 +69,13 @@ from .utils.contour import ( get_text_region_boxes_by_given_contours, get_textregion_contours_in_org_image, get_textregion_contours_in_org_image_light, - make_valid, return_contours_of_image, return_contours_of_interested_region, return_contours_of_interested_region_by_min_size, return_contours_of_interested_textline, return_parent_contours, + dilate_textregion_contours, + dilate_textline_contours, ) from .utils.rotate import ( rotate_image, @@ -1919,7 +1920,7 @@ class Eynollah: #sys.exit() polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - ##polygons_of_only_texts = self.dilate_textregions_contours(polygons_of_only_texts) + ##polygons_of_only_texts = dilate_textregion_contours(polygons_of_only_texts) polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) @@ -3669,117 +3670,6 @@ class Eynollah: return x_differential_new - def dilate_textregions_contours_textline_version(self, 
all_found_textline_polygons): - return [[np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], - dtype=np.uint)[:, np.newaxis] - for poly in region] - for region in all_found_textline_polygons] - - def dilate_textregions_contours(self, all_found_textline_polygons): - return [np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], - dtype=np.uint)[:, np.newaxis] - for poly in all_found_textline_polygons] - - - def dilate_textline_contours(self, all_found_textline_polygons): - for j in range(len(all_found_textline_polygons)): - for ij in range(len(all_found_textline_polygons[j])): - con_ind = all_found_textline_polygons[j][ij] - area = cv2.contourArea(con_ind) - - con_ind = con_ind.astype(float) - - x_differential = np.diff( con_ind[:,0,0]) - y_differential = np.diff( con_ind[:,0,1]) - - x_differential = gaussian_filter1d(x_differential, 3) - y_differential = gaussian_filter1d(y_differential, 3) - - x_min = float(np.min( con_ind[:,0,0] )) - y_min = float(np.min( con_ind[:,0,1] )) - - x_max = float(np.max( con_ind[:,0,0] )) - y_max = float(np.max( con_ind[:,0,1] )) - - x_differential_mask_nonzeros = [ ind/abs(ind) if ind!=0 else ind for ind in x_differential] - y_differential_mask_nonzeros = [ ind/abs(ind) if ind!=0 else ind for ind in y_differential] - - abs_diff=abs(abs(x_differential)- abs(y_differential) ) - - inc_x = np.zeros(len(x_differential)+1) - inc_y = np.zeros(len(x_differential)+1) - - if (y_max-y_min) <= (x_max-x_min): - dilation_m1 = round(area / (x_max-x_min) * 0.35) - else: - dilation_m1 = round(area / (y_max-y_min) * 0.35) - - if dilation_m1>12: - dilation_m1 = 12 - if dilation_m1<4: - dilation_m1 = 4 - #print(dilation_m1, 'dilation_m1') - dilation_m2 = int(dilation_m1/2.) 
+1 - - for i in range(len(x_differential)): - if abs_diff[i]==0: - inc_x[i+1] = dilation_m2*(-1*y_differential_mask_nonzeros[i]) - inc_y[i+1] = dilation_m2*(x_differential_mask_nonzeros[i]) - elif abs_diff[i]!=0 and x_differential_mask_nonzeros[i]==0 and y_differential_mask_nonzeros[i]!=0: - inc_x[i+1]= dilation_m1*(-1*y_differential_mask_nonzeros[i]) - elif abs_diff[i]!=0 and x_differential_mask_nonzeros[i]!=0 and y_differential_mask_nonzeros[i]==0: - inc_y[i+1] = dilation_m1*(x_differential_mask_nonzeros[i]) - - elif abs_diff[i]!=0 and abs_diff[i]>=3: - if abs(x_differential[i])>abs(y_differential[i]): - inc_y[i+1] = dilation_m1*(x_differential_mask_nonzeros[i]) - else: - inc_x[i+1]= dilation_m1*(-1*y_differential_mask_nonzeros[i]) - else: - inc_x[i+1] = dilation_m2*(-1*y_differential_mask_nonzeros[i]) - inc_y[i+1] = dilation_m2*(x_differential_mask_nonzeros[i]) - - inc_x[0] = inc_x[-1] - inc_y[0] = inc_y[-1] - - con_scaled = con_ind*1 - - con_scaled[:,0, 0] = con_ind[:,0,0] + np.array(inc_x)[:] - con_scaled[:,0, 1] = con_ind[:,0,1] + np.array(inc_y)[:] - - con_scaled[:,0, 1][con_scaled[:,0, 1]<0] = 0 - con_scaled[:,0, 0][con_scaled[:,0, 0]<0] = 0 - - con_ind = con_ind.astype(np.int32) - - results = [cv2.pointPolygonTest(con_ind, (con_scaled[ind,0, 0], con_scaled[ind,0, 1]), False) - for ind in range(len(con_scaled[:,0, 1])) ] - results = np.array(results) - results[results==0] = 1 - - diff_result = np.diff(results) - - indices_2 = [ind for ind in range(len(diff_result)) if diff_result[ind]==2] - indices_m2 = [ind for ind in range(len(diff_result)) if diff_result[ind]==-2] - - if results[0]==1: - con_scaled[:indices_m2[0]+1,0, 1] = con_ind[:indices_m2[0]+1,0,1] - con_scaled[:indices_m2[0]+1,0, 0] = con_ind[:indices_m2[0]+1,0,0] - indices_m2 = indices_m2[1:] - - if len(indices_2)>len(indices_m2): - con_scaled[indices_2[-1]+1:,0, 1] = con_ind[indices_2[-1]+1:,0,1] - con_scaled[indices_2[-1]+1:,0, 0] = con_ind[indices_2[-1]+1:,0,0] - indices_2 = indices_2[:-1] - - 
for ii in range(len(indices_2)): - con_scaled[indices_2[ii]+1:indices_m2[ii]+1,0, 1] = con_scaled[indices_2[ii],0, 1] - con_scaled[indices_2[ii]+1:indices_m2[ii]+1,0, 0] = con_scaled[indices_2[ii],0, 0] - - all_found_textline_polygons[j][ij][:,0,1] = con_scaled[:,0, 1] - all_found_textline_polygons[j][ij][:,0,0] = con_scaled[:,0, 0] - return all_found_textline_polygons - def filter_contours_inside_a_bigger_one(self,contours, contours_d_ordered, image, marginal_cnts=None, type_contour="textregion"): if type_contour=="textregion": areas = [cv2.contourArea(contours[j]) for j in range(len(contours))] @@ -3917,121 +3807,6 @@ class Eynollah: return contours, text_con_org, conf_contours_textregions, contours_textline, contours_only_text_parent_d_ordered, np.array(range(len(contours))) - def dilate_textlines(self, all_found_textline_polygons): - for j in range(len(all_found_textline_polygons)): - for i in range(len(all_found_textline_polygons[j])): - con_ind = all_found_textline_polygons[j][i] - con_ind = con_ind.astype(float) - - x_differential = np.diff( con_ind[:,0,0]) - y_differential = np.diff( con_ind[:,0,1]) - - x_min = float(np.min( con_ind[:,0,0] )) - y_min = float(np.min( con_ind[:,0,1] )) - - x_max = float(np.max( con_ind[:,0,0] )) - y_max = float(np.max( con_ind[:,0,1] )) - - if (y_max - y_min) > (x_max - x_min) and (x_max - x_min)<70: - x_biger_than_x = np.abs(x_differential) > np.abs(y_differential) - mult = x_biger_than_x*x_differential - - arg_min_mult = np.argmin(mult) - arg_max_mult = np.argmax(mult) - - if y_differential[0]==0: - y_differential[0] = 0.1 - if y_differential[-1]==0: - y_differential[-1]= 0.1 - y_differential = [y_differential[ind] if y_differential[ind] != 0 - else 0.5 * (y_differential[ind-1] + y_differential[ind+1]) - for ind in range(len(y_differential))] - - if y_differential[0]==0.1: - y_differential[0] = y_differential[1] - if y_differential[-1]==0.1: - y_differential[-1] = y_differential[-2] - 
y_differential.append(y_differential[0]) - - y_differential = [-1 if y_differential[ind] < 0 else 1 - for ind in range(len(y_differential))] - y_differential = self.return_it_in_two_groups(y_differential) - y_differential = np.array(y_differential) - - con_scaled = con_ind*1 - con_scaled[:,0, 0] = con_ind[:,0,0] - 8*y_differential - con_scaled[arg_min_mult,0, 1] = con_ind[arg_min_mult,0,1] + 8 - con_scaled[arg_min_mult+1,0, 1] = con_ind[arg_min_mult+1,0,1] + 8 - - try: - con_scaled[arg_min_mult-1,0, 1] = con_ind[arg_min_mult-1,0,1] + 5 - con_scaled[arg_min_mult+2,0, 1] = con_ind[arg_min_mult+2,0,1] + 5 - except: - pass - - con_scaled[arg_max_mult,0, 1] = con_ind[arg_max_mult,0,1] - 8 - con_scaled[arg_max_mult+1,0, 1] = con_ind[arg_max_mult+1,0,1] - 8 - - try: - con_scaled[arg_max_mult-1,0, 1] = con_ind[arg_max_mult-1,0,1] - 5 - con_scaled[arg_max_mult+2,0, 1] = con_ind[arg_max_mult+2,0,1] - 5 - except: - pass - - else: - y_biger_than_x = np.abs(y_differential) > np.abs(x_differential) - mult = y_biger_than_x*y_differential - - arg_min_mult = np.argmin(mult) - arg_max_mult = np.argmax(mult) - - if x_differential[0]==0: - x_differential[0] = 0.1 - if x_differential[-1]==0: - x_differential[-1]= 0.1 - x_differential = [x_differential[ind] if x_differential[ind] != 0 - else 0.5 * (x_differential[ind-1] + x_differential[ind+1]) - for ind in range(len(x_differential))] - - if x_differential[0]==0.1: - x_differential[0] = x_differential[1] - if x_differential[-1]==0.1: - x_differential[-1] = x_differential[-2] - x_differential.append(x_differential[0]) - - x_differential = [-1 if x_differential[ind] < 0 else 1 - for ind in range(len(x_differential))] - x_differential = self.return_it_in_two_groups(x_differential) - x_differential = np.array(x_differential) - - con_scaled = con_ind*1 - con_scaled[:,0, 1] = con_ind[:,0,1] + 8*x_differential - con_scaled[arg_min_mult,0, 0] = con_ind[arg_min_mult,0,0] + 8 - con_scaled[arg_min_mult+1,0, 0] = con_ind[arg_min_mult+1,0,0] + 8 - - 
try: - con_scaled[arg_min_mult-1,0, 0] = con_ind[arg_min_mult-1,0,0] + 5 - con_scaled[arg_min_mult+2,0, 0] = con_ind[arg_min_mult+2,0,0] + 5 - except: - pass - - con_scaled[arg_max_mult,0, 0] = con_ind[arg_max_mult,0,0] - 8 - con_scaled[arg_max_mult+1,0, 0] = con_ind[arg_max_mult+1,0,0] - 8 - - try: - con_scaled[arg_max_mult-1,0, 0] = con_ind[arg_max_mult-1,0,0] - 5 - con_scaled[arg_max_mult+2,0, 0] = con_ind[arg_max_mult+2,0,0] - 5 - except: - pass - - con_scaled[:,0, 1][con_scaled[:,0, 1]<0] = 0 - con_scaled[:,0, 0][con_scaled[:,0, 0]<0] = 0 - - all_found_textline_polygons[j][i][:,0,1] = con_scaled[:,0, 1] - all_found_textline_polygons[j][i][:,0,0] = con_scaled[:,0, 0] - - return all_found_textline_polygons - def delete_regions_without_textlines( self, slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con): @@ -4130,8 +3905,7 @@ class Eynollah: all_found_textline_polygons=[ all_found_textline_polygons ] - all_found_textline_polygons = self.dilate_textregions_contours_textline_version( - all_found_textline_polygons) + all_found_textline_polygons = dilate_textline_contours(all_found_textline_polygons) all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( all_found_textline_polygons, None, textline_mask_tot_ea, type_contour="textline") @@ -4255,14 +4029,14 @@ class Eynollah: boxes, boxes_d, polygons_of_marginals, contours_tables = \ self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, table_prediction, erosion_hurts) - ###polygons_of_marginals = self.dilate_textregions_contours(polygons_of_marginals) + ###polygons_of_marginals = dilate_textregion_contours(polygons_of_marginals) else: polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, \ regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = \ self.run_boxes_full_layout(image_page, textline_mask_tot, 
text_regions_p, slope_deskew, num_col_classifier, img_only_regions, table_prediction, erosion_hurts, img_bin_light if self.light_version else None) - ###polygons_of_marginals = self.dilate_textregions_contours(polygons_of_marginals) + ###polygons_of_marginals = dilate_textregion_contours(polygons_of_marginals) if self.light_version: drop_label_in_full_layout = 4 textline_mask_tot_ea_org[img_revised_tab==drop_label_in_full_layout] = 0 @@ -4398,15 +4172,14 @@ class Eynollah: #print("text region early 3 in %.1fs", time.time() - t0) if self.light_version: - contours_only_text_parent = self.dilate_textregions_contours( - contours_only_text_parent) + contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one( contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, marginal_cnts=polygons_of_marginals) #print("text region early 3.5 in %.1fs", time.time() - t0) txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) - #txt_con_org = self.dilate_textregions_contours(txt_con_org) - #contours_only_text_parent = self.dilate_textregions_contours(contours_only_text_parent) + #txt_con_org = dilate_textregion_contours(txt_con_org) + #contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) else: txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) @@ -4433,14 +4206,10 @@ class Eynollah: #slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, polygons_of_marginals, polygons_of_marginals, _ = \ # self.delete_regions_without_textlines(slopes_marginals, all_found_textline_polygons_marginals, # boxes_marginals, polygons_of_marginals, polygons_of_marginals, np.array(range(len(polygons_of_marginals)))) - 
#all_found_textline_polygons = self.dilate_textlines(all_found_textline_polygons) - #####all_found_textline_polygons = self.dilate_textline_contours(all_found_textline_polygons) - all_found_textline_polygons = self.dilate_textregions_contours_textline_version( - all_found_textline_polygons) + all_found_textline_polygons = dilate_textline_contours(all_found_textline_polygons) all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( all_found_textline_polygons, None, textline_mask_tot_ea_org, type_contour="textline") - all_found_textline_polygons_marginals = self.dilate_textregions_contours_textline_version( - all_found_textline_polygons_marginals) + all_found_textline_polygons_marginals = dilate_textline_contours(all_found_textline_polygons_marginals) contours_only_text_parent, txt_con_org, conf_contours_textregions, all_found_textline_polygons, contours_only_text_parent_d_ordered, \ index_by_text_par_con = self.filter_contours_without_textline_inside( contours_only_text_parent, txt_con_org, all_found_textline_polygons, contours_only_text_parent_d_ordered, conf_contours_textregions) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 03d45b7..f228e53 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -328,6 +328,17 @@ def return_contours_of_interested_region_by_size(region_pre_p, pixel, min_area, return img_ret[:, :, 0] +def dilate_textline_contours(self, all_found_textline_polygons): + return [[np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], + dtype=np.uint)[:, np.newaxis] + for poly in region] + for region in all_found_textline_polygons] + +def dilate_textregion_contours(self, all_found_textline_polygons): + return [np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], + dtype=np.uint)[:, np.newaxis] + for poly in all_found_textline_polygons] + def make_valid(polygon: Polygon) -> Polygon: """Ensures shapely.geometry.Polygon object is valid by 
repeated rearrangement/simplification/enlargement.""" def isint(x): From f3faa29528ce7acdafa0c02fc2a9ec4732d91e4a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 Aug 2025 12:59:03 +0200 Subject: [PATCH 012/101] refactor shapely conversions into contour2polygon / polygon2contour, also handle heterogeneous geometries --- src/eynollah/eynollah.py | 1 - src/eynollah/utils/contour.py | 107 ++++++++++++++++++++++++++-------- 2 files changed, 83 insertions(+), 25 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index fe233cb..54ace30 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -27,7 +27,6 @@ from loky import ProcessPoolExecutor import xml.etree.ElementTree as ET import cv2 import numpy as np -from shapely.geometry import Polygon from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from numba import cuda diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index f228e53..1123241 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -1,7 +1,15 @@ +from typing import Sequence, Union +from numbers import Number from functools import partial +import itertools + import cv2 import numpy as np -from shapely.geometry import Polygon +from scipy.sparse.csgraph import minimum_spanning_tree +from shapely.geometry import Polygon, LineString +from shapely.geometry.polygon import orient +from shapely import set_precision +from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new @@ -37,29 +45,28 @@ def get_text_region_boxes_by_given_contours(contours): return boxes, contours_new -def filter_contours_area_of_image(image, contours, hierarchy, max_area, min_area): +def filter_contours_area_of_image(image, contours, hierarchy, max_area=1.0, min_area=0.0, dilate=0): found_polygons_early = [] - for jv,c in enumerate(contours): - if len(c) < 3: # A polygon cannot have less than 3 points + for jv,
contour in enumerate(contours): + if len(contour) < 3: # A polygon cannot have less than 3 points continue - polygon = Polygon([point[0] for point in c]) + polygon = contour2polygon(contour, dilate=dilate) area = polygon.area if (area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and hierarchy[0][jv][3] == -1): - found_polygons_early.append(np.array(make_valid(polygon).exterior.coords[:-1], - dtype=np.uint)[:, np.newaxis]) + found_polygons_early.append(polygon2contour(polygon)) return found_polygons_early -def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, min_area): +def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area=1.0, min_area=0.0, dilate=0): found_polygons_early = [] - for jv,c in enumerate(contours): - if len(c) < 3: # A polygon cannot have less than 3 points + for jv, contour in enumerate(contours): + if len(contour) < 3: # A polygon cannot have less than 3 points continue - polygon = Polygon([point[0] for point in c]) - # area = cv2.contourArea(c) + polygon = contour2polygon(contour, dilate=dilate) + # area = cv2.contourArea(contour) area = polygon.area ##print(np.prod(thresh.shape[:2])) # Check that polygon has area greater than minimal area @@ -68,9 +75,8 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, m area <= max_area * np.prod(image.shape[:2]) and # hierarchy[0][jv][3]==-1 True): - # print(c[0][0][1]) - found_polygons_early.append(np.array(make_valid(polygon).exterior.coords[:-1], - dtype=np.uint)[:, np.newaxis]) + # print(contour[0][0][1]) + found_polygons_early.append(polygon2contour(polygon)) return found_polygons_early def find_new_features_of_contours(contours_main): @@ -328,16 +334,29 @@ def return_contours_of_interested_region_by_size(region_pre_p, pixel, min_area, return img_ret[:, :, 0] -def dilate_textline_contours(self, all_found_textline_polygons): - return [[np.array(make_valid(Polygon(poly[:, 
0]).buffer(5)).exterior.coords[:-1], - dtype=np.uint)[:, np.newaxis] - for poly in region] +def dilate_textline_contours(all_found_textline_polygons): + return [[polygon2contour(contour2polygon(contour, dilate=5)) + for contour in region] for region in all_found_textline_polygons] -def dilate_textregion_contours(self, all_found_textline_polygons): - return [np.array(make_valid(Polygon(poly[:, 0]).buffer(5)).exterior.coords[:-1], - dtype=np.uint)[:, np.newaxis] - for poly in all_found_textline_polygons] +def dilate_textregion_contours(all_found_textline_polygons): + return [polygon2contour(contour2polygon(contour, dilate=5)) + for contour in all_found_textline_polygons] + +def contour2polygon(contour: Union[np.ndarray, Sequence[Sequence[Sequence[Number]]]], dilate=0): + polygon = Polygon([point[0] for point in contour]) + if dilate: + polygon = polygon.buffer(dilate) + if polygon.geom_type == 'GeometryCollection': + # heterogeneous result: filter zero-area shapes (LineString, Point) + polygon = unary_union([geom for geom in polygon.geoms if geom.area > 0]) + if polygon.geom_type == 'MultiPolygon': + # homogeneous result: construct convex hull to connect + polygon = join_polygons(polygon.geoms) + return make_valid(polygon) + +def polygon2contour(polygon: Polygon) -> np.ndarray: + return np.array(polygon.exterior.coords[:-1], dtype=np.uint)[:, np.newaxis] def make_valid(polygon: Polygon) -> Polygon: """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement.""" @@ -346,7 +365,7 @@ def make_valid(polygon: Polygon) -> Polygon: # make sure rounding does not invalidate if not all(map(isint, np.array(polygon.exterior.coords).flat)) and polygon.minimum_clearance < 1.0: polygon = Polygon(np.round(polygon.exterior.coords)) - points = list(polygon.exterior.coords) + points = list(polygon.exterior.coords[:-1]) # try by re-arranging points for split in range(1, len(points)): if polygon.is_valid or 
polygon.simplify(polygon.area).is_valid: @@ -368,3 +387,43 @@ def make_valid(polygon: Polygon) -> Polygon: polygon = polygon.buffer(tolerance) assert polygon.is_valid, polygon.wkt return polygon + +def join_polygons(polygons: Sequence[Polygon], scale=20) -> Polygon: + """construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points""" + # ensure input polygons are simply typed and all oriented equally + polygons = [orient(poly) + for poly in itertools.chain.from_iterable( + [poly.geoms + if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] + else [poly] + for poly in polygons])] + npoly = len(polygons) + if npoly == 1: + return polygons[0] + # find min-dist path through all polygons (travelling salesman) + pairs = itertools.combinations(range(npoly), 2) + dists = np.zeros((npoly, npoly), dtype=float) + for i, j in pairs: + dist = polygons[i].distance(polygons[j]) + if dist < 1e-5: + dist = 1e-5 # if pair merely touches, we still need to get an edge + dists[i, j] = dist + dists[j, i] = dist + dists = minimum_spanning_tree(dists, overwrite=True) + # add bridge polygons (where necessary) + for prevp, nextp in zip(*dists.nonzero()): + prevp = polygons[prevp] + nextp = polygons[nextp] + nearest = nearest_points(prevp, nextp) + bridgep = orient(LineString(nearest).buffer(max(1, scale/5), resolution=1), -1) + polygons.append(bridgep) + jointp = unary_union(polygons) + assert jointp.geom_type == 'Polygon', jointp.wkt + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + jointp2 = set_precision(jointp, 1.0) + if jointp2.geom_type != 'Polygon' or not jointp2.is_valid: + jointp2 = Polygon(np.round(jointp.exterior.coords)) + jointp2 = make_valid(jointp2) + assert jointp2.geom_type == 'Polygon', jointp2.wkt + return jointp2 From 7a9e8256ee8a4c777baa0bd972697cece3e269a5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 Aug 2025 13:00:31 +0200 Subject: [PATCH 
013/101] =?UTF-8?q?increase=20dilatation:=20textregions/li?= =?UTF-8?q?nes=20(5=E2=86=926),=20seplines=20(0=E2=86=921)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/eynollah.py | 10 +++++----- src/eynollah/utils/contour.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 54ace30..8cb1d52 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1714,8 +1714,8 @@ class Eynollah: mask_images_only=(prediction_regions_org[:,:] ==2)*1 polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) - polygons_seplines = textline_con_fil = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001) + polygons_seplines = filter_contours_area_of_image( + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) @@ -1909,7 +1909,7 @@ class Eynollah: #plt.show() polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001) + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts = polygons_seplines, color=(1,1,1)) @@ -2022,7 +2022,7 @@ class Eynollah: polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001) + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001) polygons_of_only_lines 
= return_contours_of_interested_region(mask_lines_only, 1, 0.00001) @@ -2071,7 +2071,7 @@ class Eynollah: polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001) + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 1123241..c571be6 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -335,12 +335,12 @@ def return_contours_of_interested_region_by_size(region_pre_p, pixel, min_area, return img_ret[:, :, 0] def dilate_textline_contours(all_found_textline_polygons): - return [[polygon2contour(contour2polygon(contour, dilate=5)) + return [[polygon2contour(contour2polygon(contour, dilate=6)) for contour in region] for region in all_found_textline_polygons] def dilate_textregion_contours(all_found_textline_polygons): - return [polygon2contour(contour2polygon(contour, dilate=5)) + return [polygon2contour(contour2polygon(contour, dilate=6)) for contour in all_found_textline_polygons] def contour2polygon(contour: Union[np.ndarray, Sequence[Sequence[Sequence[Number]]]], dilate=0): From 11e143afee1f446bfef7c6b19ba720e5cddb981d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Aug 2025 12:16:56 +0200 Subject: [PATCH 014/101] polygon2contour: avoid overflow --- src/eynollah/utils/contour.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index c571be6..2cd7080 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -356,7 +356,8 @@ def contour2polygon(contour: Union[np.ndarray, 
Sequence[Sequence[Sequence[Number return make_valid(polygon) def polygon2contour(polygon: Polygon) -> np.ndarray: - return np.array(polygon.exterior.coords[:-1], dtype=np.uint)[:, np.newaxis] + polygon = np.array(polygon.exterior.coords[:-1], dtype=int) + return np.maximum(0, polygon).astype(np.uint)[:, np.newaxis] def make_valid(polygon: Polygon) -> Polygon: """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement.""" From 235539a35071559f8929bfcda9cb47d506c23d58 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Aug 2025 12:19:37 +0200 Subject: [PATCH 015/101] filter_contours_without_textline_inside: avoid removing from identical lists twice --- src/eynollah/eynollah.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 8cb1d52..b636b09 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3764,7 +3764,9 @@ class Eynollah: return contours def filter_contours_without_textline_inside( - self, contours,text_con_org, contours_textline, contours_only_text_parent_d_ordered, conf_contours_textregions): + self, contours, text_con_org, contours_textline, + contours_only_text_parent_d_ordered, + conf_contours_textregions): ###contours_txtline_of_all_textregions = [] ###for jj in range(len(contours_textline)): ###contours_txtline_of_all_textregions = contours_txtline_of_all_textregions + contours_textline[jj] @@ -3788,23 +3790,23 @@ class Eynollah: ###if np.any(results==1): ###contours_with_textline.append(con_tr) - textregion_index_to_del = [] + textregion_index_to_del = set() for index_textregion, textlines_textregion in enumerate(contours_textline): - if len(textlines_textregion)==0: - textregion_index_to_del.append(index_textregion) + if len(textlines_textregion) == 0: + textregion_index_to_del.add(index_textregion) + def filterfun(lis): + if len(lis) == 0: + return [] + if 
len(textregion_index_to_del) == 0: + return lis + return list(np.delete(lis, list(textregion_index_to_del))) - uniqe_args_trs = np.unique(textregion_index_to_del) - uniqe_args_trs_sorted = np.sort(uniqe_args_trs)[::-1] - - for ind_u_a_trs in uniqe_args_trs_sorted: - conf_contours_textregions.pop(ind_u_a_trs) - contours.pop(ind_u_a_trs) - contours_textline.pop(ind_u_a_trs) - text_con_org.pop(ind_u_a_trs) - if len(contours_only_text_parent_d_ordered) > 0: - contours_only_text_parent_d_ordered.pop(ind_u_a_trs) - - return contours, text_con_org, conf_contours_textregions, contours_textline, contours_only_text_parent_d_ordered, np.array(range(len(contours))) + return (filterfun(contours), + filterfun(text_con_org), + filterfun(conf_contours_textregions), + filterfun(contours_textline), + filterfun(contours_only_text_parent_d_ordered), + np.arange(len(contours) - len(textregion_index_to_del))) def delete_regions_without_textlines( self, slopes, all_found_textline_polygons, boxes_text, txt_con_org, From bca2ae3d78fcc6536c5365c9b93a0143ebbbf658 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Aug 2025 12:37:44 +0200 Subject: [PATCH 016/101] get_marginals: exit early if no peaks found to avoid spurious overlap mask --- src/eynollah/utils/marginals.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/eynollah/utils/marginals.py b/src/eynollah/utils/marginals.py index a29e50d..22ada4e 100644 --- a/src/eynollah/utils/marginals.py +++ b/src/eynollah/utils/marginals.py @@ -94,6 +94,8 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve except: point_left=first_nonzero + if point_left == first_nonzero and point_right == last_nonzero: + return text_regions if point_right>=mask_marginals.shape[1]: From 9b5182c1c07ebbdb65ea81978f9c667917b82743 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 26 Aug 2025 21:00:33 +0200 Subject: [PATCH 017/101] utils: introduce box2rect and box2slice --- src/eynollah/utils/__init__.py | 12 
++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index c479744..bbf30a8 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -300,9 +300,17 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_end_with_child_without_mother, new_main_sep_y) +def box2rect(box: Tuple[int, int, int, int]) -> Tuple[int, int, int, int]: + return (box[1], box[1] + box[3], + box[0], box[0] + box[2]) + +def box2slice(box: Tuple[int, int, int, int]) -> Tuple[slice, slice]: + return (slice(box[1], box[1] + box[3]), + slice(box[0], box[0] + box[2])) + def crop_image_inside_box(box, img_org_copy): - image_box = img_org_copy[box[1] : box[1] + box[3], box[0] : box[0] + box[2]] - return image_box, [box[1], box[1] + box[3], box[0], box[0] + box[2]] + image_box = img_org_copy[box2slice(box)] + return image_box, box2rect(box) def otsu_copy_binary(img): img_r = np.zeros((img.shape[0], img.shape[1], 3)) From 5bff2d156ab32b72470b547870874da3053a3d7b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 26 Aug 2025 21:02:43 +0200 Subject: [PATCH 018/101] use box2rect instead of crop_image_inside_box when no image needed --- src/eynollah/eynollah.py | 8 +++++--- src/eynollah/utils/separate_lines.py | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b636b09..6847c1f 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -98,6 +98,8 @@ from .utils.resize import resize_image from .utils import ( boosting_headers_by_longshot_region_segmentation, crop_image_inside_box, + box2rect, + box2slice, find_num_col, otsu_copy_binary, put_drop_out_from_only_drop_model, @@ -1542,7 +1544,7 @@ class Eynollah: all_found_textline_polygons.append(textlines_ins[::-1]) slopes.append(slope_deskew) - _, crop_coor = crop_image_inside_box(boxes[index],image_page_rotated) + crop_coor = 
box2rect(boxes[index]) all_box_coord.append(crop_coor) return all_found_textline_polygons, boxes, contours, contours_par, all_box_coord, np.array(range(len(contours_par))), slopes @@ -1754,7 +1756,7 @@ class Eynollah: ##polygons_of_images_fin.append(ploy_img_ind) box = cv2.boundingRect(ploy_img_ind) - _, page_coord_img = crop_image_inside_box(box, text_regions_p_true) + page_coord_img = box2rect(box) # cont_page.append(np.array([[page_coord[2], page_coord[0]], # [page_coord[3], page_coord[0]], # [page_coord[3], page_coord[1]], @@ -1768,7 +1770,7 @@ class Eynollah: if h < 150 or w < 150: pass else: - _, page_coord_img = crop_image_inside_box(box, text_regions_p_true) + page_coord_img = box2rect(box) # cont_page.append(np.array([[page_coord[2], page_coord[0]], # [page_coord[3], page_coord[0]], # [page_coord[3], page_coord[1]], diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index ffbfff7..b1a90b5 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -18,6 +18,8 @@ from .contour import ( from . 
import ( find_num_col_deskew, crop_image_inside_box, + box2rect, + box2slice, ) def dedup_separate_lines(img_patch, contour_text_interest, thetha, axis): @@ -1540,7 +1542,7 @@ def do_work_of_slopes_new( logger.debug('enter do_work_of_slopes_new') x, y, w, h = box_text - _, crop_coor = crop_image_inside_box(box_text, image_page_rotated) + crop_coor = box2rect(box_text) mask_textline = np.zeros(textline_mask_tot_ea.shape) mask_textline = cv2.fillPoly(mask_textline, pts=[contour], color=(1,1,1)) all_text_region_raw = textline_mask_tot_ea * mask_textline @@ -1631,7 +1633,7 @@ def do_work_of_slopes_new_curved( slope_for_all = slope_deskew slope = slope_for_all - _, crop_coor = crop_image_inside_box(box_text, image_page_rotated) + crop_coor = box2rect(box_text) if abs(slope_for_all) < 45: textline_region_in_image = np.zeros(textline_mask_tot_ea.shape) @@ -1685,7 +1687,7 @@ def do_work_of_slopes_new_light( logger.debug('enter do_work_of_slopes_new_light') x, y, w, h = box_text - _, crop_coor = crop_image_inside_box(box_text, image_page_rotated) + crop_coor = box2rect(box_text) mask_textline = np.zeros(textline_mask_tot_ea.shape) mask_textline = cv2.fillPoly(mask_textline, pts=[contour], color=(1,1,1)) all_text_region_raw = textline_mask_tot_ea * mask_textline From 5b16c2fc0066f3e1542dfdf7a1fe9f9241401c38 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 26 Aug 2025 21:05:40 +0200 Subject: [PATCH 019/101] avoid pulling unused 'image_page_rotated' through functions --- src/eynollah/eynollah.py | 48 +++++++++++++--------------- src/eynollah/utils/separate_lines.py | 6 ++-- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6847c1f..8f66af5 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1521,7 +1521,7 @@ class Eynollah: self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions2 - def get_slopes_and_deskew_new_light2(self, contours, 
contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): + def get_slopes_and_deskew_new_light2(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): polygons_of_textlines = return_contours_of_interested_region(textline_mask_tot,1,0.00001) M_main_tot = [cv2.moments(polygons_of_textlines[j]) @@ -1549,13 +1549,12 @@ class Eynollah: return all_found_textline_polygons, boxes, contours, contours_par, all_box_coord, np.array(range(len(contours_par))), slopes - def get_slopes_and_deskew_new_light(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): + def get_slopes_and_deskew_new_light(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_light") results = self.executor.map(partial(do_work_of_slopes_new_light, textline_mask_tot_ea=textline_mask_tot, - image_page_rotated=image_page_rotated, slope_deskew=slope_deskew,textline_light=self.textline_light, logger=self.logger,), boxes, contours, contours_par, range(len(contours_par))) @@ -1563,13 +1562,12 @@ class Eynollah: self.logger.debug("exit get_slopes_and_deskew_new_light") return tuple(zip(*results)) - def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): + def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new") results = self.executor.map(partial(do_work_of_slopes_new, textline_mask_tot_ea=textline_mask_tot, - image_page_rotated=image_page_rotated, slope_deskew=slope_deskew, MAX_SLOPE=MAX_SLOPE, KERNEL=KERNEL, @@ -1580,13 +1578,12 @@ class Eynollah: self.logger.debug("exit get_slopes_and_deskew_new") return tuple(zip(*results)) - def get_slopes_and_deskew_new_curved(self, contours, contours_par, 
textline_mask_tot, image_page_rotated, boxes, mask_texts_only, num_col, scale_par, slope_deskew): + def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, boxes, mask_texts_only, num_col, scale_par, slope_deskew): if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_curved") results = self.executor.map(partial(do_work_of_slopes_new_curved, textline_mask_tot_ea=textline_mask_tot, - image_page_rotated=image_page_rotated, mask_texts_only=mask_texts_only, num_col=num_col, scale_par=scale_par, @@ -2935,10 +2932,10 @@ class Eynollah: return slope_deskew def run_marginals( - self, image_page, textline_mask_tot_ea, mask_images, mask_lines, + self, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction): - image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] + textline_mask_tot = textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 text_regions_p_1[mask_lines[:, :] == 1] = 3 @@ -2957,10 +2954,7 @@ class Eynollah: except Exception as e: self.logger.error("exception %s", e) - if self.plotter: - self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) - self.plotter.save_plot_of_layout_main(text_regions_p, image_page) - return textline_mask_tot, text_regions_p, image_page_rotated + return textline_mask_tot, text_regions_p def run_boxes_no_full_layout( self, image_page, textline_mask_tot, text_regions_p, @@ -3112,7 +3106,7 @@ class Eynollah: text_regions_p[:,:][table_prediction[:,:]==1] = 10 img_revised_tab = text_regions_p[:,:] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ + _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) text_regions_p_1_n = 
resize_image(text_regions_p_1_n,text_regions_p.shape[0],text_regions_p.shape[1]) @@ -3132,7 +3126,7 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ + _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) text_regions_p_1_n = resize_image(text_regions_p_1_n,text_regions_p.shape[0],text_regions_p.shape[1]) @@ -4010,9 +4004,12 @@ class Eynollah: text_regions_p_1 = resize_image(text_regions_p_1,img_h_new, img_w_new ) table_prediction = resize_image(table_prediction,img_h_new, img_w_new ) - textline_mask_tot, text_regions_p, image_page_rotated = \ - self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, + textline_mask_tot, text_regions_p = \ + self.run_marginals(textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) + if self.plotter: + self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) + self.plotter.save_plot_of_layout_main(text_regions_p, image_page) if self.light_version and num_col_classifier in (1,2): image_page = resize_image(image_page,org_h_l_m, org_w_l_m ) @@ -4021,7 +4018,6 @@ class Eynollah: textline_mask_tot = resize_image(textline_mask_tot,org_h_l_m, org_w_l_m ) text_regions_p_1 = resize_image(text_regions_p_1,org_h_l_m, org_w_l_m ) table_prediction = resize_image(table_prediction,org_h_l_m, org_w_l_m ) - image_page_rotated = resize_image(image_page_rotated,org_h_l_m, org_w_l_m ) self.logger.info("detection of marginals took %.1fs", time.time() - t1) #print("text region early 2 marginal in %.1fs", time.time() - t0) @@ -4197,11 +4193,11 @@ class Eynollah: all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light2( txt_con_org, 
contours_only_text_parent, textline_mask_tot_ea_org, - image_page_rotated, boxes_text, slope_deskew) + boxes_text, slope_deskew) all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light2( polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, - image_page_rotated, boxes_marginals, slope_deskew) + boxes_marginals, slope_deskew) #slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con = \ # self.delete_regions_without_textlines(slopes, all_found_textline_polygons, @@ -4221,11 +4217,11 @@ class Eynollah: all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, \ index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light( txt_con_org, contours_only_text_parent, textline_mask_tot_ea, - image_page_rotated, boxes_text, slope_deskew) + boxes_text, slope_deskew) all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light( polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, - image_page_rotated, boxes_marginals, slope_deskew) + boxes_marginals, slope_deskew) #all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( # all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline") else: @@ -4233,25 +4229,25 @@ class Eynollah: all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new( txt_con_org, contours_only_text_parent, textline_mask_tot_ea, - image_page_rotated, boxes_text, slope_deskew) + boxes_text, slope_deskew) all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new( 
polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, - image_page_rotated, boxes_marginals, slope_deskew) + boxes_marginals, slope_deskew) else: scale_param = 1 textline_mask_tot_ea_erode = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=2) all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved( txt_con_org, contours_only_text_parent, textline_mask_tot_ea_erode, - image_page_rotated, boxes_text, text_only, + boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons = small_textlines_to_parent_adherence2( all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved( polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_erode, - image_page_rotated, boxes_marginals, text_only, + boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2( all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index b1a90b5..dcddc65 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1532,7 +1532,7 @@ def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map def do_work_of_slopes_new( box_text, contour, contour_par, index_r_con, - textline_mask_tot_ea, image_page_rotated, slope_deskew, + textline_mask_tot_ea, slope_deskew, logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None ): if KERNEL is None: @@ -1590,7 +1590,7 @@ def do_work_of_slopes_new( def do_work_of_slopes_new_curved( box_text, contour, contour_par, index_r_con, - textline_mask_tot_ea, 
image_page_rotated, mask_texts_only, num_col, scale_par, slope_deskew, + textline_mask_tot_ea, mask_texts_only, num_col, scale_par, slope_deskew, logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None ): if KERNEL is None: @@ -1679,7 +1679,7 @@ def do_work_of_slopes_new_curved( def do_work_of_slopes_new_light( box_text, contour, contour_par, index_r_con, - textline_mask_tot_ea, image_page_rotated, slope_deskew, textline_light, + textline_mask_tot_ea, slope_deskew, textline_light, logger=None ): if logger is None: From 4337d6298596b1272c35b909a0ec0ee50adc4ba2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 26 Aug 2025 21:06:36 +0200 Subject: [PATCH 020/101] =?UTF-8?q?contours:=20rename=20'pixel'=20?= =?UTF-8?q?=E2=86=92=20'label'=20for=20clarity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/utils/contour.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 2cd7080..0700ed4 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -141,12 +141,12 @@ def return_parent_contours(contours, hierarchy): if hierarchy[0][i][3] == -1] return contours_parent -def return_contours_of_interested_region(region_pre_p, pixel, min_area=0.0002): +def return_contours_of_interested_region(region_pre_p, label, min_area=0.0002): # pixels of images are identified by 5 if len(region_pre_p.shape) == 3: - cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 + cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: - cnts_images = (region_pre_p[:, :] == pixel) * 1 + cnts_images = (region_pre_p[:, :] == label) * 1 cnts_images = cnts_images.astype(np.uint8) cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) @@ -267,12 +267,12 @@ def get_textregion_contours_in_org_image_light(cnts, img, confidence_matrix): 
confs.append(np.sum(confidence_matrix * cnt_mask) / np.sum(cnt_mask)) return cnts, confs -def return_contours_of_interested_textline(region_pre_p, pixel): +def return_contours_of_interested_textline(region_pre_p, label): # pixels of images are identified by 5 if len(region_pre_p.shape) == 3: - cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 + cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: - cnts_images = (region_pre_p[:, :] == pixel) * 1 + cnts_images = (region_pre_p[:, :] == label) * 1 cnts_images = cnts_images.astype(np.uint8) cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) @@ -295,12 +295,12 @@ def return_contours_of_image(image): contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) return contours, hierarchy -def return_contours_of_interested_region_by_min_size(region_pre_p, pixel, min_size=0.00003): +def return_contours_of_interested_region_by_min_size(region_pre_p, label, min_size=0.00003): # pixels of images are identified by 5 if len(region_pre_p.shape) == 3: - cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 + cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: - cnts_images = (region_pre_p[:, :] == pixel) * 1 + cnts_images = (region_pre_p[:, :] == label) * 1 cnts_images = cnts_images.astype(np.uint8) cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) @@ -313,12 +313,12 @@ def return_contours_of_interested_region_by_min_size(region_pre_p, pixel, min_si return contours_imgs -def return_contours_of_interested_region_by_size(region_pre_p, pixel, min_area, max_area): +def return_contours_of_interested_region_by_size(region_pre_p, label, min_area, max_area): # pixels of images are identified by 5 if len(region_pre_p.shape) == 3: - cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 + cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: - cnts_images = 
(region_pre_p[:, :] == pixel) * 1 + cnts_images = (region_pre_p[:, :] == label) * 1 cnts_images = cnts_images.astype(np.uint8) cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) From f458e3ece01aa7142c77b930dbdf1843c6835d85 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 26 Aug 2025 21:07:18 +0200 Subject: [PATCH 021/101] writer: SeparatorRegion needs SeparatorRegionType (not ImageRegionType) --- src/eynollah/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 92e353f..01c86de 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -296,7 +296,7 @@ class EynollahXmlWriter(): page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)))) for mm in range(len(polygons_lines_to_be_written_in_xml)): - page.add_SeparatorRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(polygons_lines_to_be_written_in_xml[mm], [0 , 0, 0, 0])))) + page.add_SeparatorRegion(SeparatorRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(polygons_lines_to_be_written_in_xml[mm], [0 , 0, 0, 0])))) for mm in range(len(found_polygons_tables)): page.add_TableRegion(TableRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_tables[mm], page_coord)))) From dc0caad512219a2e08da3841c215167eed1526bb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 26 Aug 2025 21:07:50 +0200 Subject: [PATCH 022/101] writer: use @type='heading' instead of 'header' --- src/eynollah/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 01c86de..b9e906a 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ 
-268,7 +268,7 @@ class EynollahXmlWriter(): self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) for mm in range(len(found_polygons_text_region_h)): - textregion = TextRegionType(id=counter.next_region_id, type_='header', + textregion = TextRegionType(id=counter.next_region_id, type_='heading', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) page.add_TextRegion(textregion) From abf5c0f845255f247ce4991d18a5b3b8a3808f4e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 2 Sep 2025 15:01:52 +0200 Subject: [PATCH 023/101] get_smallest_skew: when shifting search range of rotation angle, compare resulting (maximum) variances instead of blindly assuming the new range is better --- src/eynollah/utils/separate_lines.py | 32 +++++++++++++++++----------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index dcddc65..3363367 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1486,33 +1486,36 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, if main_page and img_patch_org.shape[1] > img_patch_org.shape[0]: angles = np.array([-45, 0, 45, 90,]) - angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angle, _ = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) angles = np.linspace(angle - 22.5, angle + 22.5, n_tot_angles) - angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angle, _ = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) elif main_page: angles = np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) - angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angle, var = 
get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) early_slope_edge=11 if abs(angle) > early_slope_edge: if angle < 0: - angles = np.linspace(-90, -12, n_tot_angles) + angles2 = np.linspace(-90, -12, n_tot_angles) else: - angles = np.linspace(90, 12, n_tot_angles) - angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angles2 = np.linspace(90, 12, n_tot_angles) + angle2, var2 = get_smallest_skew(img_resized, sigma_des, angles2, map=map, logger=logger, plotter=plotter) + if var2 > var: + angle = angle2 else: angles = np.linspace(-25, 25, int(0.5 * n_tot_angles) + 10) - angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angle, var = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) early_slope_edge=22 if abs(angle) > early_slope_edge: if angle < 0: - angles = np.linspace(-90, -25, int(0.5 * n_tot_angles) + 10) + angles2 = np.linspace(-90, -25, int(0.5 * n_tot_angles) + 10) else: - angles = np.linspace(90, 25, int(0.5 * n_tot_angles) + 10) - angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) - + angles2 = np.linspace(90, 25, int(0.5 * n_tot_angles) + 10) + angle2, var2 = get_smallest_skew(img_resized, sigma_des, angles2, map=map, logger=logger, plotter=plotter) + if var2 > var: + angle = angle2 return angle def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map): @@ -1524,11 +1527,14 @@ def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map try: var_res = np.array(results) assert var_res.any() - angle = angles[np.argmax(var_res)] + idx = np.argmax(var_res) + angle = angles[idx] + var = var_res[idx] except: logger.exception("cannot determine best angle among %s", str(angles)) angle = 0 - return angle + var = 0 + return angle, var def do_work_of_slopes_new( box_text, contour, contour_par, 
index_r_con, From 8be2c7977101080856e4d6e43660a0de055b86c9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Sep 2025 09:01:18 +0200 Subject: [PATCH 024/101] Revert "deskewing with faster multiprocessing" This reverts commit 5db3e9fa64d39c128bd9bee27c9d0fb73b3459d2. --- src/eynollah/eynollah.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 8f66af5..b450b17 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2926,6 +2926,7 @@ class Eynollah: #print(textline_mask_tot_ea.shape, 'textline_mask_tot_ea deskew') slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, map=self.executor.map, logger=self.logger, plotter=self.plotter) + if self.plotter: self.plotter.save_deskewed_image(slope_deskew) self.logger.info("slope_deskew: %.2f°", slope_deskew) From 31f240c3b8a6eaa034b5ae02cf009930e8275725 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 2 Sep 2025 15:04:04 +0200 Subject: [PATCH 025/101] do_image_rotation, do_work_of_slopes_new_curved: pass arrays via shared memory --- src/eynollah/eynollah.py | 12 +++++--- src/eynollah/utils/separate_lines.py | 12 ++++++-- src/eynollah/utils/shm.py | 45 ++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 7 deletions(-) create mode 100644 src/eynollah/utils/shm.py diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b450b17..42af8e4 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -95,6 +95,7 @@ from .utils.drop_capitals import ( ) from .utils.marginals import get_marginals from .utils.resize import resize_image +from .utils.shm import share_ndarray from .utils import ( boosting_headers_by_longshot_region_segmentation, crop_image_inside_box, @@ -1582,9 +1583,11 @@ class Eynollah: if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_curved") - results = 
self.executor.map(partial(do_work_of_slopes_new_curved, - textline_mask_tot_ea=textline_mask_tot, - mask_texts_only=mask_texts_only, + with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: + with share_ndarray(mask_texts_only) as mask_texts_only_shared: + results = self.executor.map(partial(do_work_of_slopes_new_curved, + textline_mask_tot_ea=textline_mask_tot_shared, + mask_texts_only=mask_texts_only_shared, num_col=num_col, scale_par=scale_par, slope_deskew=slope_deskew, @@ -1593,7 +1596,8 @@ class Eynollah: logger=self.logger, plotter=self.plotter,), boxes, contours, contours_par, range(len(contours_par))) - #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) + #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) + results = list(results) # exhaust prior to release self.logger.debug("exit get_slopes_and_deskew_new_curved") return tuple(zip(*results)) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 3363367..e4bb953 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -15,6 +15,7 @@ from .contour import ( return_contours_of_interested_textline, find_contours_mean_y_diff, ) +from .shm import share_ndarray, wrap_ndarray_shared from . 
import ( find_num_col_deskew, crop_image_inside_box, @@ -1454,7 +1455,8 @@ def separate_lines_new2(img_crop, thetha, num_col, slope_region, logger=None, pl return img_patch_interest_revised -def do_image_rotation(angle, img, sigma_des, logger=None): +@wrap_ndarray_shared(kw='img') +def do_image_rotation(angle, img=None, sigma_des=1.0, logger=None): if logger is None: logger = getLogger(__package__) img_rot = rotate_image(img, angle) @@ -1521,7 +1523,8 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map): if logger is None: logger = getLogger(__package__) - results = list(map(partial(do_image_rotation, img=img, sigma_des=sigma_des, logger=logger), angles)) + with share_ndarray(img) as img_shared: + results = list(map(partial(do_image_rotation, img=img_shared, sigma_des=sigma_des, logger=logger), angles)) if plotter: plotter.save_plot_of_rotation_angle(angles, results) try: @@ -1594,9 +1597,12 @@ def do_work_of_slopes_new( return cnt_clean_rot, box_text, contour, contour_par, crop_coor, index_r_con, slope +@wrap_ndarray_shared(kw='textline_mask_tot_ea') +@wrap_ndarray_shared(kw='mask_texts_only') def do_work_of_slopes_new_curved( box_text, contour, contour_par, index_r_con, - textline_mask_tot_ea, mask_texts_only, num_col, scale_par, slope_deskew, + textline_mask_tot_ea=None, mask_texts_only=None, + num_col=1, scale_par=1.0, slope_deskew=0.0, logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None ): if KERNEL is None: diff --git a/src/eynollah/utils/shm.py b/src/eynollah/utils/shm.py new file mode 100644 index 0000000..4b51053 --- /dev/null +++ b/src/eynollah/utils/shm.py @@ -0,0 +1,45 @@ +from multiprocessing import shared_memory +from contextlib import contextmanager +from functools import wraps +import numpy as np + +@contextmanager +def share_ndarray(array: np.ndarray): + size = np.dtype(array.dtype).itemsize * np.prod(array.shape) + shm = 
shared_memory.SharedMemory(create=True, size=size) + try: + shared_array = np.ndarray(array.shape, dtype=array.dtype, buffer=shm.buf) + shared_array[:] = array[:] + shared_array.flags["WRITEABLE"] = False + yield dict(shape=array.shape, dtype=array.dtype, name=shm.name) + finally: + shm.close() + shm.unlink() + +@contextmanager +def ndarray_shared(array: dict): + shm = shared_memory.SharedMemory(name=array['name']) + try: + array = np.ndarray(array['shape'], dtype=array['dtype'], buffer=shm.buf) + yield array + finally: + shm.close() + +def wrap_ndarray_shared(kw=None): + def wrapper(f): + if kw is None: + @wraps(f) + def shared_func(array, *args, **kwargs): + with ndarray_shared(array) as ndarray: + return f(ndarray, *args, **kwargs) + return shared_func + else: + @wraps(f) + def shared_func(*args, **kwargs): + array = kwargs.pop(kw) + with ndarray_shared(array) as ndarray: + kwargs[kw] = ndarray + return f(*args, **kwargs) + return shared_func + return wrapper + From 0662ece536e090989ad4e2281317336129eae468 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 4 Sep 2025 15:18:55 +0200 Subject: [PATCH 026/101] do_work_of_slopes*: use shm also in non-light mode(s) --- src/eynollah/eynollah.py | 33 ++++++++++++++++------------ src/eynollah/utils/separate_lines.py | 6 +++-- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 42af8e4..6333ca5 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1554,11 +1554,14 @@ class Eynollah: if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_light") - results = self.executor.map(partial(do_work_of_slopes_new_light, - textline_mask_tot_ea=textline_mask_tot, - slope_deskew=slope_deskew,textline_light=self.textline_light, - logger=self.logger,), - boxes, contours, contours_par, range(len(contours_par))) + with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: + results = 
self.executor.map(partial(do_work_of_slopes_new_light, + textline_mask_tot_ea=textline_mask_tot_shared, + slope_deskew=slope_deskew, + textline_light=self.textline_light, + logger=self.logger,), + boxes, contours, contours_par, range(len(contours_par))) + results = list(results) # exhaust prior to release #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new_light") return tuple(zip(*results)) @@ -1567,14 +1570,16 @@ class Eynollah: if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new") - results = self.executor.map(partial(do_work_of_slopes_new, - textline_mask_tot_ea=textline_mask_tot, - slope_deskew=slope_deskew, - MAX_SLOPE=MAX_SLOPE, - KERNEL=KERNEL, - logger=self.logger, - plotter=self.plotter,), - boxes, contours, contours_par, range(len(contours_par))) + with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: + results = self.executor.map(partial(do_work_of_slopes_new, + textline_mask_tot_ea=textline_mask_tot_shared, + slope_deskew=slope_deskew, + MAX_SLOPE=MAX_SLOPE, + KERNEL=KERNEL, + logger=self.logger, + plotter=self.plotter,), + boxes, contours, contours_par, range(len(contours_par))) + results = list(results) # exhaust prior to release #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new") return tuple(zip(*results)) @@ -1596,8 +1601,8 @@ class Eynollah: logger=self.logger, plotter=self.plotter,), boxes, contours, contours_par, range(len(contours_par))) - #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) results = list(results) # exhaust prior to release + #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new_curved") 
return tuple(zip(*results)) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index e4bb953..1a2f511 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1539,9 +1539,10 @@ def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map var = 0 return angle, var +@wrap_ndarray_shared(kw='textline_mask_tot_ea') def do_work_of_slopes_new( box_text, contour, contour_par, index_r_con, - textline_mask_tot_ea, slope_deskew, + textline_mask_tot_ea=None, slope_deskew=0.0, logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None ): if KERNEL is None: @@ -1689,9 +1690,10 @@ def do_work_of_slopes_new_curved( return textlines_cnt_per_region[::-1], box_text, contour, contour_par, crop_coor, index_r_con, slope +@wrap_ndarray_shared(kw='textline_mask_tot_ea') def do_work_of_slopes_new_light( box_text, contour, contour_par, index_r_con, - textline_mask_tot_ea, slope_deskew, textline_light, + textline_mask_tot_ea=None, slope_deskew=0, textline_light=True, logger=None ): if logger is None: From 04c3d7dd1b98b01adf2b8ccd72830ad5fd9a4e95 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 18 Sep 2025 20:07:54 +0200 Subject: [PATCH 027/101] get_smallest_skew: avoid shm if no ProcessPoolExecutor is passed --- src/eynollah/utils/separate_lines.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 1a2f511..4d8badb 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1469,7 +1469,7 @@ def do_image_rotation(angle, img=None, sigma_des=1.0, logger=None): return var def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, - main_page=False, logger=None, plotter=None, map=map): + main_page=False, logger=None, plotter=None, map=None): if main_page and plotter: plotter.save_plot_of_textline_density(img_patch_org) @@ -1523,8 +1523,13 @@ 
def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map): if logger is None: logger = getLogger(__package__) - with share_ndarray(img) as img_shared: - results = list(map(partial(do_image_rotation, img=img_shared, sigma_des=sigma_des, logger=logger), angles)) + if map is None: + results = [do_image_rotation.__wrapped__(angle, img=img, sigma_des=sigma_des, logger=logger) + for angle in angles] + else: + with share_ndarray(img) as img_shared: + results = list(map(partial(do_image_rotation, img=img_shared, sigma_des=sigma_des, logger=logger), + angles)) if plotter: plotter.save_plot_of_rotation_angle(angles, results) try: From b94c96fcbbb5bbce72bc9cdc9b334953abd774ad Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 20 Sep 2025 00:56:33 +0200 Subject: [PATCH 028/101] find_num_col: exit early if empty (avoiding exceptions) --- src/eynollah/utils/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index bbf30a8..9daec7d 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -383,6 +383,10 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): return np.std(z) def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): + if not regions_without_separators.any(): + return 0, [] + #plt.imshow(regions_without_separators) + #plt.show() regions_without_separators_0 = regions_without_separators.sum(axis=0) ##plt.plot(regions_without_separators_0) ##plt.show() @@ -402,6 +406,9 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl zneg = gaussian_filter1d(zneg, sigma_) peaks_neg, _ = find_peaks(zneg, height=0) + #plt.plot(zneg) + #plt.plot(peaks_neg, zneg[peaks_neg], 'rx') + #plt.show() peaks, _ = find_peaks(z, height=0) peaks_neg = peaks_neg - 10 - 10 @@ -416,9 +423,13 @@ def 
find_num_col(regions_without_separators, num_col_classifier, tables, multipl (peaks_neg < (regions_without_separators.shape[1] - 370))] interest_pos = z[peaks] interest_pos = interest_pos[interest_pos > 10] + if not interest_pos.any(): + return 0, [] # plt.plot(z) # plt.show() interest_neg = z[peaks_neg] + if not interest_neg.any(): + return 0, [] min_peaks_pos = np.min(interest_pos) max_peaks_pos = np.max(interest_pos) From 0366707136568241c42bac2f3bf675dda5989fe2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 20 Sep 2025 00:57:00 +0200 Subject: [PATCH 029/101] get_smallest_skew: do not pass logger --- src/eynollah/utils/separate_lines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 4d8badb..1d27a17 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1528,7 +1528,7 @@ def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map for angle in angles] else: with share_ndarray(img) as img_shared: - results = list(map(partial(do_image_rotation, img=img_shared, sigma_des=sigma_des, logger=logger), + results = list(map(partial(do_image_rotation, img=img_shared, sigma_des=sigma_des, logger=None), angles)) if plotter: plotter.save_plot_of_rotation_angle(angles, results) From 758602403eb92625608d04e7d77fcbf896c55e2d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 21 Sep 2025 21:35:22 +0200 Subject: [PATCH 030/101] replace loky with concurrent.futures.ProcessPoolExecutor (faster) --- src/eynollah/eynollah.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6333ca5..1c70498 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -23,7 +23,7 @@ import gc import copy import json -from loky import ProcessPoolExecutor +from concurrent.futures import ProcessPoolExecutor import xml.etree.ElementTree 
as ET import cv2 import numpy as np @@ -244,7 +244,7 @@ class Eynollah: self.num_col_lower = num_col_lower self.logger = logger if logger else getLogger('eynollah') # for parallelization of CPU-intensive tasks: - self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) + self.executor = ProcessPoolExecutor(max_workers=cpu_count()) atexit.register(self.executor.shutdown) self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" From c0137c29ad46adf2096664632e9a20a30afbfe09 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 02:23:43 +0200 Subject: [PATCH 031/101] try to fix the failed outsourcing of utils_ocr --- src/eynollah/eynollah.py | 63 ++------------------------------- src/eynollah/utils/utils_ocr.py | 1 + 2 files changed, 3 insertions(+), 61 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 32490a2..192f6f4 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3917,34 +3917,6 @@ class Eynollah: region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] return ordered, region_ids - def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes): - return [ls_cons[sorted_indexes[index]] for index in range(len(sorted_indexes))] - - def return_it_in_two_groups(self, x_differential): - split = [ind if x_differential[ind]!=x_differential[ind+1] else -1 - for ind in range(len(x_differential)-1)] - split_masked = list( np.array(split[:])[np.array(split[:])!=-1] ) - if 0 not in split_masked: - split_masked.insert(0, -1) - split_masked.append(len(x_differential)-1) - - split_masked = np.array(split_masked) +1 - - sums = [np.sum(x_differential[split_masked[ind]:split_masked[ind+1]]) - for ind in range(len(split_masked)-1)] - - indexes_to_bec_changed = [ind if (np.abs(sums[ind-1]) > np.abs(sums[ind]) and - np.abs(sums[ind+1]) > np.abs(sums[ind])) else -1 - for ind in range(1,len(sums)-1)] - 
indexes_to_bec_changed_filtered = np.array(indexes_to_bec_changed)[np.array(indexes_to_bec_changed)!=-1] - - x_differential_new = np.copy(x_differential) - for i in indexes_to_bec_changed_filtered: - i_slice = slice(split_masked[i], split_masked[i+1]) - x_differential_new[i_slice] = -1 * np.array(x_differential)[i_slice] - - return x_differential_new - def return_start_and_end_of_common_text_of_textline_ocr(self,textline_image, ind_tot): width = np.shape(textline_image)[1] height = np.shape(textline_image)[0] @@ -3988,36 +3960,6 @@ class Eynollah: else: pass - def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.06*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - #print(len(peaks_real), 'len(peaks_real)') - - peaks_real = peaks_real[(peaks_realwidth1)] - - arg_max = np.argmax(sum_smoothed[peaks_real]) - peaks_final = peaks_real[arg_max] - - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peaks_final, peaks_final], [0, height-1]) - ##plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') - - return peaks_final - else: - return None - def return_start_and_end_of_common_text_of_textline_ocr_new_splitted( self, peaks_real, sum_smoothed, start_split, end_split): @@ -4079,8 +4021,7 @@ class Eynollah: #width1 = int ( width/2. - common_window ) #width2 = int ( width/2. 
+ common_window ) - split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section( - textline_image, ind_tot) + split_point = return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image) if split_point: image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) @@ -5144,7 +5085,7 @@ class Eynollah: box_ind = all_box_coord[indexing] #print(ind_poly,np.shape(ind_poly), 'ind_poly') #print(box_ind) - ind_poly = self.return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) + ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) #print(ind_poly_copy) ind_poly[ind_poly<0] = 0 x, y, w, h = cv2.boundingRect(ind_poly) diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 4fa99f7..5f19387 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -92,6 +92,7 @@ def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(t return peaks_final else: return None + # Function to fit text inside the given area def fit_text_single_line(draw, text, font_path, max_width, max_height): initial_font_size = 50 From f857ee7b518e23c62b28aab32cd64d396da836fe Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 19 Sep 2025 02:12:18 +0200 Subject: [PATCH 032/101] simplify --- src/eynollah/eynollah.py | 23 +++-------------------- src/eynollah/utils/__init__.py | 2 +- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 192f6f4..0c9692e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3182,26 +3182,9 @@ class Eynollah: num_col = num_col + 1 if not num_column_is_classified: num_col_classifier = num_col + 1 - if self.num_col_upper and self.num_col_lower: - if self.num_col_upper == self.num_col_lower: - num_col_classifier = 
self.num_col_upper - else: - if num_col_classifier < self.num_col_lower: - num_col_classifier = self.num_col_lower - if num_col_classifier > self.num_col_upper: - num_col_classifier = self.num_col_upper - - elif self.num_col_lower and not self.num_col_upper: - if num_col_classifier < self.num_col_lower: - num_col_classifier = self.num_col_lower - - elif self.num_col_upper and not self.num_col_lower: - if num_col_classifier > self.num_col_upper: - num_col_classifier = self.num_col_upper - - else: - pass - + num_col_classifier = min(self.num_col_upper or num_col_classifier, + max(self.num_col_lower or num_col_classifier, + num_col_classifier)) except Exception as why: self.logger.error(why) num_col = None diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 243430e..f8926cf 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1675,9 +1675,9 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_fin=[] num_col = 0 try: - peaks_neg_fin_org=np.copy(peaks_neg_fin) if (len(peaks_neg_fin)+1) Date: Tue, 30 Sep 2025 03:52:19 +0200 Subject: [PATCH 033/101] indent extremely long lines --- src/eynollah/eynollah.py | 750 ++++++++++++++++++--------- src/eynollah/utils/__init__.py | 30 +- src/eynollah/utils/separate_lines.py | 136 +++-- src/eynollah/utils/utils_ocr.py | 25 +- 4 files changed, 652 insertions(+), 289 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0c9692e..2e31433 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -272,7 +272,6 @@ class Eynollah: else: self.threshold_art_class_textline = 0.1 - self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" self.model_dir_of_binarization = dir_models + "/eynollah-binarization_20210425" self.model_dir_of_col_classifier = dir_models + "/eynollah-column-classifier_20210425" @@ -289,8 +288,17 @@ class Eynollah: self.model_page_dir = dir_models + 
"/model_eynollah_page_extraction_20250915" self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" - self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - self.model_reading_order_dir = dir_models + "/model_eynollah_reading_order_20250824"#"/model_mb_ro_aug_ens_11"#"/model_step_3200000_mb_ro"#"/model_ens_reading_order_machine_based"#"/model_mb_ro_aug_ens_8"#"/model_ens_reading_order_machine_based" + self.model_region_dir_p_ens_light_only_images_extraction = (dir_models + + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" + ) + self.model_reading_order_dir = (dir_models + + "/model_eynollah_reading_order_20250824" + #"/model_mb_ro_aug_ens_11" + #"/model_step_3200000_mb_ro" + #"/model_ens_reading_order_machine_based" + #"/model_mb_ro_aug_ens_8" + #"/model_ens_reading_order_machine_based" + ) #"/modelens_12sp_elay_0_3_4__3_6_n" #"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" @@ -379,11 +387,9 @@ class Eynollah: self.b_s_ocr = 8 else: self.b_s_ocr = int(batch_size_ocr) - with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: characters = json.load(config_file) - AUTOTUNE = tf.data.AUTOTUNE @@ -840,7 +846,9 @@ class Eynollah: self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False, thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): + thresholding_for_artificial_class_in_light_version=False, + thresholding_for_fl_light_version=False, + threshold_art_class_textline=0.1): self.logger.debug("enter do_prediction") img_height_model = model.layers[-1].output_shape[1] @@ -1254,7 +1262,9 @@ class Eynollah: self, patches, img, model, 
n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False, threshold_art_class_textline=0.1, threshold_art_class_layout=0.1): + thresholding_for_artificial_class_in_light_version=False, + threshold_art_class_textline=0.1, + threshold_art_class_layout=0.1): self.logger.debug("enter do_prediction_new_concept") img_height_model = model.layers[-1].output_shape[1] @@ -1384,7 +1394,8 @@ class Eynollah: for i_batch, j_batch in zip(list_i_s, list_j_s): seg_in = seg[indexer_inside_batch] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): seg_in_art = seg_art[indexer_inside_batch] index_y_u_in = list_y_u[indexer_inside_batch] @@ -1404,7 +1415,8 @@ class Eynollah: label_p_pred[0, 0:-margin or None, 0:-margin or None, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + 0:index_x_u_in - margin, 1] = \ seg_in_art[0:-margin or None, @@ -1421,7 +1433,8 @@ class Eynollah: label_p_pred[0, margin:, margin:, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - 0, 1] = \ seg_in_art[margin:, @@ -1439,7 +1452,8 @@ class Eynollah: 0:-margin or None, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + 
thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + 0:index_x_u_in - margin, 1] = \ seg_in_art[margin:, @@ -1456,7 +1470,8 @@ class Eynollah: label_p_pred[0, 0:-margin or None, margin:, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0, 1] = \ seg_in_art[0:-margin or None, @@ -1473,7 +1488,8 @@ class Eynollah: label_p_pred[0, margin:-margin or None, 0:-margin or None, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + 0:index_x_u_in - margin, 1] = \ seg_in_art[margin:-margin or None, @@ -1489,7 +1505,8 @@ class Eynollah: label_p_pred[0, margin:-margin or None, margin:, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0, 1] = \ seg_in_art[margin:-margin or None, @@ -1505,7 +1522,8 @@ class Eynollah: label_p_pred[0, 0:-margin or None, margin:-margin or None, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin, 1] = \ seg_in_art[0:-margin or None, @@ -1521,7 +1539,8 @@ 
class Eynollah: label_p_pred[0, margin:, margin:-margin or None, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - margin, 1] = \ seg_in_art[margin:, @@ -1537,7 +1556,8 @@ class Eynollah: label_p_pred[0, margin:-margin or None, margin:-margin or None, 1] - if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + if (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version): prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin, 1] = \ seg_in_art[margin:-margin or None, @@ -1686,7 +1706,10 @@ class Eynollah: else: img = resize_image(img, int(img_height_h * 2500 / float(img_width_h)), 2500).astype(np.uint8) - prediction_regions = self.do_prediction(patches, img, model_region, marginal_of_patch_percent=0.1, n_batch_inference=3, thresholding_for_fl_light_version=thresholding_for_fl_light_version) + prediction_regions = self.do_prediction(patches, img, model_region, + marginal_of_patch_percent=0.1, + n_batch_inference=3, + thresholding_for_fl_light_version=thresholding_for_fl_light_version) prediction_regions = resize_image(prediction_regions, img_height_h, img_width_h) self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions @@ -1839,7 +1862,10 @@ class Eynollah: cy_textline_in = [cy_main_tot[ind] for ind in indexes_in] w_h_textlines_in = [w_h_textlines[ind][0] / float(w_h_textlines[ind][1]) for ind in indexes_in] - textlines_ins = self.get_textlines_of_a_textregion_sorted(textlines_ins, cx_textline_in, cy_textline_in, w_h_textlines_in) + textlines_ins = self.get_textlines_of_a_textregion_sorted(textlines_ins, + cx_textline_in, + 
cy_textline_in, + w_h_textlines_in) all_found_textline_polygons.append(textlines_ins)#[::-1]) slopes.append(slope_deskew) @@ -1847,7 +1873,13 @@ class Eynollah: crop_coor = box2rect(boxes[index]) all_box_coord.append(crop_coor) - return all_found_textline_polygons, boxes, contours, contours_par, all_box_coord, np.array(range(len(contours_par))), slopes + return (all_found_textline_polygons, + boxes, + contours, + contours_par, + all_box_coord, + np.array(range(len(contours_par))), + slopes) def get_slopes_and_deskew_new_light(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): if not len(contours): @@ -1883,7 +1915,8 @@ class Eynollah: self.logger.debug("exit get_slopes_and_deskew_new") return tuple(zip(*results)) - def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, boxes, mask_texts_only, num_col, scale_par, slope_deskew): + def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, boxes, + mask_texts_only, num_col, scale_par, slope_deskew): if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_curved") @@ -1914,10 +1947,11 @@ class Eynollah: img_w = img_org.shape[1] img = resize_image(img_org, int(img_org.shape[0] * scaler_h), int(img_org.shape[1] * scaler_w)) - prediction_textline = self.do_prediction( - use_patches, img, self.model_textline, - marginal_of_patch_percent=0.15, n_batch_inference=3, - thresholding_for_artificial_class_in_light_version=self.textline_light, threshold_art_class_textline=self.threshold_art_class_textline) + prediction_textline = self.do_prediction(use_patches, img, self.model_textline, + marginal_of_patch_percent=0.15, + n_batch_inference=3, + thresholding_for_artificial_class_in_light_version=self.textline_light, + threshold_art_class_textline=self.threshold_art_class_textline) #if not self.textline_light: #if num_col_classifier==1: #prediction_textline_nopatch = self.do_prediction(False, img, 
self.model_textline) @@ -2009,12 +2043,14 @@ class Eynollah: boxes_sub_new = [] poly_sub = [] for mv in range(len(boxes_per_process)): - crop_img, _ = crop_image_inside_box(boxes_per_process[mv], np.repeat(textline_mask_tot[:, :, np.newaxis], 3, axis=2)) + crop_img, _ = crop_image_inside_box(boxes_per_process[mv], + np.repeat(textline_mask_tot[:, :, np.newaxis], 3, axis=2)) crop_img = crop_img[:, :, 0] crop_img = cv2.erode(crop_img, KERNEL, iterations=2) try: textline_con, hierarchy = return_contours_of_image(crop_img) - textline_con_fil = filter_contours_area_of_image(crop_img, textline_con, hierarchy, max_area=1, min_area=0.0008) + textline_con_fil = filter_contours_area_of_image(crop_img, textline_con, hierarchy, + max_area=1, min_area=0.0008) y_diff_mean = find_contours_mean_y_diff(textline_con_fil) sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) crop_img[crop_img > 0] = 1 @@ -2139,7 +2175,13 @@ class Eynollah: [page_coord_img[2], page_coord_img[1]]])) self.logger.debug("exit get_regions_extract_images_only") - return text_regions_p_true, erosion_hurts, polygons_seplines, polygons_of_images_fin, image_page, page_coord, cont_page + return (text_regions_p_true, + erosion_hurts, + polygons_seplines, + polygons_of_images_fin, + image_page, + page_coord, + cont_page) def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier, skip_layout_and_reading_order=False): self.logger.debug("enter get_regions_light_v") @@ -2197,7 +2239,8 @@ class Eynollah: #print("inside 1 ", time.time()-t_in) ###textline_mask_tot_ea = self.run_textline(img_bin) - self.logger.debug("detecting textlines on %s with %d colors", str(img_resized.shape), len(np.unique(img_resized))) + self.logger.debug("detecting textlines on %s with %d colors", + str(img_resized.shape), len(np.unique(img_resized))) textline_mask_tot_ea = self.run_textline(img_resized, num_col_classifier) textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_height_h, img_width_h ) @@ -2214,13 +2257,15 @@ 
class Eynollah: img_resized.shape[1], img_resized.shape[0], num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( True, img_resized, self.model_region_1_2, n_batch_inference=1, - thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) + thresholding_for_some_classes_in_light_version=True, + threshold_art_class_layout=self.threshold_art_class_layout) else: prediction_regions_org = np.zeros((self.image_org.shape[0], self.image_org.shape[1], 3)) confidence_matrix = np.zeros((self.image_org.shape[0], self.image_org.shape[1])) prediction_regions_page, confidence_matrix_page = self.do_prediction_new_concept( False, self.image_page_org_size, self.model_region_1_2, n_batch_inference=1, - thresholding_for_artificial_class_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) + thresholding_for_artificial_class_in_light_version=True, + threshold_art_class_layout=self.threshold_art_class_layout) ys = slice(*self.page_coord[0:2]) xs = slice(*self.page_coord[2:4]) prediction_regions_org[ys, xs] = prediction_regions_page @@ -2233,8 +2278,11 @@ class Eynollah: img_resized.shape[1], img_resized.shape[0], new_h, num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( True, img_resized, self.model_region_1_2, n_batch_inference=2, - thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) - ###prediction_regions_org = self.do_prediction(True, img_bin, self.model_region, n_batch_inference=3, thresholding_for_some_classes_in_light_version=True) + thresholding_for_some_classes_in_light_version=True, + threshold_art_class_layout=self.threshold_art_class_layout) + ###prediction_regions_org = self.do_prediction(True, img_bin, self.model_region, + ###n_batch_inference=3, + ###thresholding_for_some_classes_in_light_version=True) #print("inside 3 ", time.time()-t_in) 
#plt.imshow(prediction_regions_org[:,:,0]) #plt.show() @@ -2297,7 +2345,12 @@ class Eynollah: #plt.show() #print("inside 4 ", time.time()-t_in) self.logger.debug("exit get_regions_light_v") - return text_regions_p_true, erosion_hurts, polygons_seplines, textline_mask_tot_ea, img_bin, confidence_matrix + return (text_regions_p_true, + erosion_hurts, + polygons_seplines, + textline_mask_tot_ea, + img_bin, + confidence_matrix) else: img_bin = resize_image(img_bin,img_height_h, img_width_h ) self.logger.debug("exit get_regions_light_v") @@ -2417,14 +2470,10 @@ class Eynollah: #img = resize_image(img_org, int(img_org.shape[0]*1), int(img_org.shape[1]*1)) #prediction_regions_org = self.do_prediction(True, img, self.model_region) - #prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) - #prediction_regions_org = prediction_regions_org[:,:,0] - #prediction_regions_org[(prediction_regions_org[:,:] == 1) & (mask_zeros_y[:,:] == 1)]=0 - mask_lines_only = (prediction_regions_org == 3)*1 mask_texts_only = (prediction_regions_org == 1)*1 mask_images_only= (prediction_regions_org == 2)*1 @@ -2843,7 +2892,8 @@ class Eynollah: contours_new.append(contours_sep[ji]) if num_col_classifier>=2: only_recent_contour_image = np.zeros((layout.shape[0],layout.shape[1])) - only_recent_contour_image= cv2.fillPoly(only_recent_contour_image, pts=[contours_sep[ji]], color=(1,1,1)) + only_recent_contour_image= cv2.fillPoly(only_recent_contour_image, + pts=[contours_sep[ji]], color=(1,1,1)) table_pixels_masked_from_early_pre = only_recent_contour_image * table_prediction_early iou_in = 100. 
* table_pixels_masked_from_early_pre.sum() / only_recent_contour_image.sum() #print(iou_in,'iou_in_in1') @@ -2928,9 +2978,11 @@ class Eynollah: contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) if indiv==pixel_table: - main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area = 1, min_area = 0.001) + main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, + max_area=1, min_area=0.001) else: - main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area = 1, min_area = min_area) + main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, + max_area=1, min_area=min_area) img_comm = cv2.fillPoly(img_comm, pts = main_contours, color = (indiv, indiv, indiv)) img_comm = img_comm.astype(np.uint8) @@ -2965,8 +3017,14 @@ class Eynollah: y_min_main_line ,y_max_main_line=find_features_of_contours(contours_line) y_min_main_tab ,y_max_main_tab=find_features_of_contours(contours_tab) - cx_tab_m_text,cy_tab_m_text ,x_min_tab_m_text , x_max_tab_m_text, y_min_tab_m_text ,y_max_tab_m_text, _= find_new_features_of_contours(contours_table_m_text) - cx_tabl,cy_tabl ,x_min_tabl , x_max_tabl, y_min_tabl ,y_max_tabl,_= find_new_features_of_contours(contours_tab) + (cx_tab_m_text, cy_tab_m_text, + x_min_tab_m_text, x_max_tab_m_text, + y_min_tab_m_text, y_max_tab_m_text, + _) = find_new_features_of_contours(contours_table_m_text) + (cx_tabl, cy_tabl, + x_min_tabl, x_max_tabl, + y_min_tabl, y_max_tabl, + _) = find_new_features_of_contours(contours_tab) if len(y_min_main_tab )>0: y_down_tabs=[] @@ -2976,9 +3034,15 @@ class Eynollah: y_down_tab=[] y_up_tab=[] for i_l in range(len(y_min_main_line)): - if y_min_main_tab[i_t]>y_min_main_line[i_l] and y_max_main_tab[i_t]>y_min_main_line[i_l] and y_min_main_tab[i_t]>y_max_main_line[i_l] and y_max_main_tab[i_t]>y_min_main_line[i_l]: + if (y_min_main_tab[i_t] > y_min_main_line[i_l] and + 
y_max_main_tab[i_t] > y_min_main_line[i_l] and + y_min_main_tab[i_t] > y_max_main_line[i_l] and + y_max_main_tab[i_t] > y_min_main_line[i_l]): pass - elif y_min_main_tab[i_t]= SLOPE_THRESHOLD: _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) + rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, + table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n,text_regions_p.shape[0],text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d,text_regions_p.shape[0],text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n,text_regions_p.shape[0],text_regions_p.shape[1]) + text_regions_p_1_n = resize_image(text_regions_p_1_n, + text_regions_p.shape[0], + text_regions_p.shape[1]) + textline_mask_tot_d = resize_image(textline_mask_tot_d, + text_regions_p.shape[0], + text_regions_p.shape[1]) + table_prediction_n = resize_image(table_prediction_n, + text_regions_p.shape[0], + text_regions_p.shape[1]) regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 @@ -3502,11 +3580,18 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) + rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, + table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n,text_regions_p.shape[0],text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d,text_regions_p.shape[0],text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n,text_regions_p.shape[0],text_regions_p.shape[1]) + text_regions_p_1_n = resize_image(text_regions_p_1_n, + text_regions_p.shape[0], 
+ text_regions_p.shape[1]) + textline_mask_tot_d = resize_image(textline_mask_tot_d, + text_regions_p.shape[0], + text_regions_p.shape[1]) + table_prediction_n = resize_image(table_prediction_n, + text_regions_p.shape[0], + text_regions_p.shape[1]) regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 @@ -3565,7 +3650,8 @@ class Eynollah: pixel_line = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( - text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, + text_regions_p_tables, boxes_d, 0, splitter_y_new_d, + peaks_neg_tot_tables_d, text_regions_p_tables, num_col_classifier, 0.000005, pixel_line) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( @@ -3574,8 +3660,9 @@ class Eynollah: img_revised_tab2_d_rotated = np.round(img_revised_tab2_d_rotated) img_revised_tab2_d_rotated = img_revised_tab2_d_rotated.astype(np.int8) - - img_revised_tab2_d_rotated = resize_image(img_revised_tab2_d_rotated, text_regions_p.shape[0], text_regions_p.shape[1]) + img_revised_tab2_d_rotated = resize_image(img_revised_tab2_d_rotated, + text_regions_p.shape[0], + text_regions_p.shape[1]) if np.abs(slope_deskew) < 0.13: img_revised_tab = np.copy(img_revised_tab2[:,:,0]) @@ -3646,7 +3733,8 @@ class Eynollah: ##else: ##regions_fully_np = filter_small_drop_capitals_from_no_patch_layout(regions_fully_np, text_regions_p) - ###regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, regions_fully_np, img_only_regions) + ###regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, + ### regions_fully_np, img_only_regions) # plt.imshow(regions_fully[:,:,0]) # plt.show() text_regions_p[:, :][regions_fully[:, :, 0] == drop_capital_label_in_full_layout_model] = 4 @@ -3709,7 +3797,10 @@ class Eynollah: min_cont_size_to_be_dilated = 10 if len(contours_only_text_parent)>min_cont_size_to_be_dilated and 
self.light_version: - cx_conts, cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) + (cx_conts, cy_conts, + x_min_conts, x_max_conts, + y_min_conts, y_max_conts, + _) = find_new_features_of_contours(contours_only_text_parent) args_cont_located = np.array(range(len(contours_only_text_parent))) diff_y_conts = np.abs(y_max_conts[:]-y_min_conts) @@ -3724,15 +3815,31 @@ class Eynollah: args_cont_located_excluded = args_cont_located[diff_x_ratio>=1.3] args_cont_located_included = args_cont_located[diff_x_ratio<1.3] - contours_only_text_parent_excluded = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]>=1.3]#contours_only_text_parent[diff_x_ratio>=1.3] - contours_only_text_parent_included = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]<1.3]#contours_only_text_parent[diff_x_ratio<1.3] + contours_only_text_parent_excluded = [contours_only_text_parent[ind] + #contours_only_text_parent[diff_x_ratio>=1.3] + for ind in range(len(contours_only_text_parent)) + if diff_x_ratio[ind]>=1.3] + contours_only_text_parent_included = [contours_only_text_parent[ind] + #contours_only_text_parent[diff_x_ratio<1.3] + for ind in range(len(contours_only_text_parent)) + if diff_x_ratio[ind]<1.3] - - cx_conts_excluded = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]>=1.3]#cx_conts[diff_x_ratio>=1.3] - cx_conts_included = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]<1.3]#cx_conts[diff_x_ratio<1.3] - - cy_conts_excluded = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]>=1.3]#cy_conts[diff_x_ratio>=1.3] - cy_conts_included = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]<1.3]#cy_conts[diff_x_ratio<1.3] + cx_conts_excluded = [cx_conts[ind] + #cx_conts[diff_x_ratio>=1.3] + for ind in range(len(cx_conts)) + if diff_x_ratio[ind]>=1.3] + 
cx_conts_included = [cx_conts[ind] + #cx_conts[diff_x_ratio<1.3] + for ind in range(len(cx_conts)) + if diff_x_ratio[ind]<1.3] + cy_conts_excluded = [cy_conts[ind] + #cy_conts[diff_x_ratio>=1.3] + for ind in range(len(cy_conts)) + if diff_x_ratio[ind]>=1.3] + cy_conts_included = [cy_conts[ind] + #cy_conts[diff_x_ratio<1.3] + for ind in range(len(cy_conts)) + if diff_x_ratio[ind]<1.3] #print(diff_x_ratio, 'ratio') text_regions_p = text_regions_p.astype('uint8') @@ -3754,7 +3861,10 @@ class Eynollah: contours_only_dilated, hir_on_text_dilated = return_contours_of_image(text_regions_p_textregions_dilated) contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) - indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located = self.return_indexes_of_contours_loctaed_inside_another_list_of_contours(contours_only_dilated, contours_only_text_parent_included, cx_conts_included, cy_conts_included, args_cont_located_included) + indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located = \ + self.return_indexes_of_contours_located_inside_another_list_of_contours( + contours_only_dilated, contours_only_text_parent_included, + cx_conts_included, cy_conts_included, args_cont_located_included) if len(args_cont_located_excluded)>0: @@ -3767,7 +3877,7 @@ class Eynollah: flattened_array = np.concatenate([arr.ravel() for arr in array_list]) #print(len( np.unique(flattened_array)), 'indexes_of_located_cont uniques') - missing_textregions = list( set(np.array(range(len(contours_only_text_parent))) ) - set(np.unique(flattened_array)) ) + missing_textregions = list( set(range(len(contours_only_text_parent))) - set(flattened_array) ) #print(missing_textregions, 'missing_textregions') for ind in missing_textregions: @@ -3887,12 +3997,13 @@ class Eynollah: region_with_curr_order = ordered[ind] if region_with_curr_order < len(contours_only_dilated): if 
np.isscalar(indexes_of_located_cont[region_with_curr_order]): - org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + org_contours_indexes.extend([indexes_of_located_cont[region_with_curr_order]]) else: arg_sort_located_cont = np.argsort(center_y_coordinates_of_located[region_with_curr_order]) - org_contours_indexes = org_contours_indexes + list(np.array(indexes_of_located_cont[region_with_curr_order])[arg_sort_located_cont]) ##org_contours_indexes + list ( + org_contours_indexes.extend( + np.array(indexes_of_located_cont[region_with_curr_order])[arg_sort_located_cont]) else: - org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + org_contours_indexes.extend([indexes_of_located_cont[region_with_curr_order]]) region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] return org_contours_indexes, region_ids @@ -3915,17 +4026,13 @@ class Eynollah: if len(peaks_real)>70: print(len(peaks_real), 'len(peaks_real)') - peaks_real = peaks_real[(peaks_realwidth1)] arg_sort = np.argsort(sum_smoothed[peaks_real]) - arg_sort4 =arg_sort[::-1][:4] - peaks_sort_4 = peaks_real[arg_sort][::-1][:4] argsort_sorted = np.argsort(peaks_sort_4) - first_4_sorted = peaks_sort_4[argsort_sorted] y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] #print(first_4_sorted,'first_4_sorted') @@ -4109,7 +4216,8 @@ class Eynollah: return x_differential_new - def filter_contours_inside_a_bigger_one(self,contours, contours_d_ordered, image, marginal_cnts=None, type_contour="textregion"): + def filter_contours_inside_a_bigger_one(self, contours, contours_d_ordered, image, + marginal_cnts=None, type_contour="textregion"): if type_contour=="textregion": areas = [cv2.contourArea(contours[j]) for j in range(len(contours))] area_tot = image.shape[0]*image.shape[1] @@ -4129,7 +4237,10 @@ class Eynollah: results = [cv2.pointPolygonTest(contours[ind], (cx_main[ind_small], cy_main[ind_small]), False) 
for ind in contours_index_big] if marginal_cnts: - results_marginal = [cv2.pointPolygonTest(marginal_cnts[ind], (cx_main[ind_small], cy_main[ind_small]), False) + results_marginal = [cv2.pointPolygonTest(marginal_cnts[ind], + (cx_main[ind_small], + cy_main[ind_small]), + False) for ind in range(len(marginal_cnts))] results_marginal = np.array(results_marginal) @@ -4184,7 +4295,10 @@ class Eynollah: args_with_bigger_area = np.array(args_all)[areas_without > 1.5*area_of_con_interest] if len(args_with_bigger_area)>0: - results = [cv2.pointPolygonTest(contours_txtline_of_all_textregions[ind], (cx_main_tot[ij], cy_main_tot[ij]), False) + results = [cv2.pointPolygonTest(contours_txtline_of_all_textregions[ind], + (cx_main_tot[ij], + cy_main_tot[ij]), + False) for ind in args_with_bigger_area ] results = np.array(results) if np.any(results==1): @@ -4196,14 +4310,16 @@ class Eynollah: textregion_index_to_del = np.array(textregion_index_to_del) textline_in_textregion_index_to_del = np.array(textline_in_textregion_index_to_del) for ind_u_a_trs in np.unique(textregion_index_to_del): - textline_in_textregion_index_to_del_ind = textline_in_textregion_index_to_del[textregion_index_to_del==ind_u_a_trs] + textline_in_textregion_index_to_del_ind = \ + textline_in_textregion_index_to_del[textregion_index_to_del==ind_u_a_trs] textline_in_textregion_index_to_del_ind = np.sort(textline_in_textregion_index_to_del_ind)[::-1] for ittrd in textline_in_textregion_index_to_del_ind: contours[ind_u_a_trs].pop(ittrd) return contours - def return_indexes_of_contours_loctaed_inside_another_list_of_contours(self, contours, contours_loc, cx_main_loc, cy_main_loc, indexes_loc): + def return_indexes_of_contours_located_inside_another_list_of_contours( + self, contours, contours_loc, cx_main_loc, cy_main_loc, indexes_loc): indexes_of_located_cont = [] center_x_coordinates_of_located = [] center_y_coordinates_of_located = [] @@ -4217,7 +4333,8 @@ class Eynollah: for ind in range(len(cy_main_loc)) ] 
results = np.array(results) indexes_in = np.where((results == 0) | (results == 1)) - indexes = indexes_loc[indexes_in]# [(results == 0) | (results == 1)]#np.where((results == 0) | (results == 1)) + # [(results == 0) | (results == 1)]#np.where((results == 0) | (results == 1)) + indexes = indexes_loc[indexes_in] indexes_of_located_cont.append(indexes) center_x_coordinates_of_located.append(np.array(cx_main_loc)[indexes_in] ) @@ -4247,7 +4364,10 @@ class Eynollah: ###contours_with_textline = [] ###for ind_tr, con_tr in enumerate(contours): - ###results = [cv2.pointPolygonTest(con_tr, (cx_main_textline[index_textline_con], cy_main_textline[index_textline_con]), False) + ###results = [cv2.pointPolygonTest(con_tr, + ### (cx_main_textline[index_textline_con], + ### cy_main_textline[index_textline_con]), + ### False) ### for index_textline_con in range(len(contours_txtline_of_all_textregions)) ] ###results = np.array(results) ###if np.any(results==1): @@ -4300,7 +4420,9 @@ class Eynollah: return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, contours_only_text_parent_rem, index_by_text_par_con_rem_sort) - def separate_marginals_to_left_and_right_and_order_from_top_to_down(self, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width): + def separate_marginals_to_left_and_right_and_order_from_top_to_down( + self, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, + slopes_marginals, mid_point_of_page_width): cx_marg, cy_marg, _, _, _, _, _ = find_new_features_of_contours( polygons_of_marginals) @@ -4310,8 +4432,10 @@ class Eynollah: poly_marg_left = list( np.array(polygons_of_marginals)[cx_marg < mid_point_of_page_width] ) poly_marg_right = list( np.array(polygons_of_marginals)[cx_marg >= mid_point_of_page_width] ) - all_found_textline_polygons_marginals_left = list( np.array(all_found_textline_polygons_marginals)[cx_marg < 
mid_point_of_page_width] ) - all_found_textline_polygons_marginals_right = list( np.array(all_found_textline_polygons_marginals)[cx_marg >= mid_point_of_page_width] ) + all_found_textline_polygons_marginals_left = \ + list( np.array(all_found_textline_polygons_marginals)[cx_marg < mid_point_of_page_width] ) + all_found_textline_polygons_marginals_right = \ + list( np.array(all_found_textline_polygons_marginals)[cx_marg >= mid_point_of_page_width] ) all_box_coord_marginals_left = list( np.array(all_box_coord_marginals)[cx_marg < mid_point_of_page_width] ) all_box_coord_marginals_right = list( np.array(all_box_coord_marginals)[cx_marg >= mid_point_of_page_width] ) @@ -4322,20 +4446,38 @@ class Eynollah: cy_marg_left = cy_marg[cx_marg < mid_point_of_page_width] cy_marg_right = cy_marg[cx_marg >= mid_point_of_page_width] - ordered_left_marginals = [poly for _, poly in sorted(zip(cy_marg_left, poly_marg_left), key=lambda x: x[0])] - ordered_right_marginals = [poly for _, poly in sorted(zip(cy_marg_right, poly_marg_right), key=lambda x: x[0])] + ordered_left_marginals = [poly for _, poly in sorted(zip(cy_marg_left, poly_marg_left), + key=lambda x: x[0])] + ordered_right_marginals = [poly for _, poly in sorted(zip(cy_marg_right, poly_marg_right), + key=lambda x: x[0])] - ordered_left_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_left, all_found_textline_polygons_marginals_left), key=lambda x: x[0])] - ordered_right_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_right, all_found_textline_polygons_marginals_right), key=lambda x: x[0])] + ordered_left_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_left, + all_found_textline_polygons_marginals_left), + key=lambda x: x[0])] + ordered_right_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_right, + all_found_textline_polygons_marginals_right), + key=lambda x: x[0])] - ordered_left_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_left, all_box_coord_marginals_left), 
key=lambda x: x[0])] - ordered_right_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_right, all_box_coord_marginals_right), key=lambda x: x[0])] + ordered_left_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_left, + all_box_coord_marginals_left), + key=lambda x: x[0])] + ordered_right_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_right, + all_box_coord_marginals_right), + key=lambda x: x[0])] - ordered_left_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_left, slopes_marg_left), key=lambda x: x[0])] - ordered_right_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_right, slopes_marg_right), key=lambda x: x[0])] + ordered_left_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_left, slopes_marg_left), + key=lambda x: x[0])] + ordered_right_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_right, slopes_marg_right), + key=lambda x: x[0])] - return ordered_left_marginals, ordered_right_marginals, ordered_left_marginals_textline, ordered_right_marginals_textline, ordered_left_marginals_bbox, ordered_right_marginals_bbox, ordered_left_slopes_marginals, ordered_right_slopes_marginals - + return (ordered_left_marginals, + ordered_right_marginals, + ordered_left_marginals_textline, + ordered_right_marginals_textline, + ordered_left_marginals_bbox, + ordered_right_marginals_bbox, + ordered_left_slopes_marginals, + ordered_right_slopes_marginals) def run(self, overwrite: bool = False, @@ -4420,9 +4562,11 @@ class Eynollah: self.logger.info(f"Processing file: {self.writer.image_filename}") self.logger.info("Step 1/5: Image Enhancement") - img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) + img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = \ + self.run_enhancement(self.light_version) - self.logger.info(f"Image: {self.image.shape[1]}x{self.image.shape[0]}, {self.dpi} DPI, {num_col_classifier} columns") + 
self.logger.info(f"Image: {self.image.shape[1]}x{self.image.shape[0]}, " + f"{self.dpi} DPI, {num_col_classifier} columns") if is_image_enhanced: self.logger.info("Enhancement applied") @@ -4433,7 +4577,8 @@ class Eynollah: if self.extract_only_images: self.logger.info("Step 2/5: Image Extraction Mode") - text_regions_p_1, erosion_hurts, polygons_seplines, polygons_of_images, image_page, page_coord, cont_page = \ + text_regions_p_1, erosion_hurts, polygons_seplines, polygons_of_images, \ + image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], @@ -4465,20 +4610,20 @@ class Eynollah: M_main_tot = [cv2.moments(all_found_textline_polygons[j]) for j in range(len(all_found_textline_polygons))] - w_h_textlines = [cv2.boundingRect(all_found_textline_polygons[j])[2:] for j in range(len(all_found_textline_polygons))] + w_h_textlines = [cv2.boundingRect(all_found_textline_polygons[j])[2:] + for j in range(len(all_found_textline_polygons))] w_h_textlines = [w_h_textlines[j][0] / float(w_h_textlines[j][1]) for j in range(len(w_h_textlines))] cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - all_found_textline_polygons = self.get_textlines_of_a_textregion_sorted(all_found_textline_polygons, cx_main_tot, cy_main_tot, w_h_textlines)#all_found_textline_polygons[::-1] - - all_found_textline_polygons=[ all_found_textline_polygons ] - + all_found_textline_polygons = self.get_textlines_of_a_textregion_sorted( + #all_found_textline_polygons[::-1] + all_found_textline_polygons, cx_main_tot, cy_main_tot, w_h_textlines) + all_found_textline_polygons = [ all_found_textline_polygons ] all_found_textline_polygons = dilate_textline_contours(all_found_textline_polygons) 
all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( all_found_textline_polygons, None, textline_mask_tot_ea, type_contour="textline") - order_text_new = [0] slopes =[0] id_of_texts_tot =['region_0001'] @@ -4498,15 +4643,23 @@ class Eynollah: if self.ocr and not self.tr: gc.collect() - ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, textline_light=True) + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( + image_page, all_found_textline_polygons, self.prediction_model, + self.b_s_ocr, self.num_to_char, textline_light=True) else: ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, - all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, contours_tables, ocr_all_textlines=ocr_all_textlines, conf_contours_textregion=conf_contours_textregions, skip_layout_reading_order=self.skip_layout_and_reading_order) + all_found_textline_polygons, page_coord, polygons_of_images, + polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, contours_tables, + ocr_all_textlines=ocr_all_textlines, + conf_contours_textregion=conf_contours_textregions, + skip_layout_reading_order=self.skip_layout_and_reading_order) self.logger.info("Basic processing complete") return pcgts @@ -4516,7 +4669,8 @@ class Eynollah: if self.light_version: 
self.logger.info("Using light version processing") - text_regions_p_1 ,erosion_hurts, polygons_seplines, textline_mask_tot_ea, img_bin_light, confidence_matrix = \ + text_regions_p_1 ,erosion_hurts, polygons_seplines, textline_mask_tot_ea, \ + img_bin_light, confidence_matrix = \ self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) #print("text region early -2 in %.1fs", time.time() - t0) @@ -4528,7 +4682,6 @@ class Eynollah: img_h_new = img_w_new * textline_mask_tot_ea.shape[0] // textline_mask_tot_ea.shape[1] textline_mask_tot_ea_deskew = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) - slope_deskew = self.run_deskew(textline_mask_tot_ea_deskew) else: slope_deskew = self.run_deskew(textline_mask_tot_ea) @@ -4537,7 +4690,8 @@ class Eynollah: num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light = \ self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, - num_col_classifier, num_column_is_classified, erosion_hurts, img_bin_light) + num_col_classifier, num_column_is_classified, + erosion_hurts, img_bin_light) #self.logger.info("run graphics %.1fs ", time.time() - t1t) #print("text region early -3 in %.1fs", time.time() - t0) textline_mask_tot_ea_org = np.copy(textline_mask_tot_ea) @@ -4552,7 +4706,8 @@ class Eynollah: t1 = time.time() num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ text_regions_p_1, cont_page, table_prediction = \ - self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) + self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, + erosion_hurts) self.logger.info(f"Graphics detection took {time.time() - t1:.1f}s") #self.logger.info('cont_page %s', cont_page) #plt.imshow(table_prediction) @@ -4617,13 +4772,15 @@ class Eynollah: ## birdan 
sora chock chakir t1 = time.time() if not self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, \ + polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + textline_mask_tot_d, regions_without_separators_d, \ boxes, boxes_d, polygons_of_marginals, contours_tables = \ self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, table_prediction, erosion_hurts) ###polygons_of_marginals = dilate_textregion_contours(polygons_of_marginals) else: - polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, \ + polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + textline_mask_tot_d, regions_without_separators_d, \ regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = \ self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions, table_prediction, erosion_hurts, @@ -4690,8 +4847,10 @@ class Eynollah: areas_cnt_text_d = self.return_list_of_contours_with_desired_order( areas_cnt_text_d, index_con_parents_d) - cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d]) - cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_d) + cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = \ + find_new_features_of_contours([contours_biggest_d]) + cx_bigest_d, cy_biggest_d, _, _, _, _, _ = \ + find_new_features_of_contours(contours_only_text_parent_d) try: if len(cx_bigest_d) >= 5: cx_bigest_d_last5 = cx_bigest_d[-5:] @@ -4751,13 +4910,19 @@ class Eynollah: pcgts = self.writer.build_pagexml_full_layout( [], [], page_coord, [], [], [], [], [], [], polygons_of_images, contours_tables, [], - polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, empty_marginals, empty_marginals, [], [], [], [], + 
polygons_of_marginals, polygons_of_marginals, + empty_marginals, empty_marginals, + empty_marginals, empty_marginals, + [], [], [], [], cont_page, polygons_seplines) else: pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], polygons_of_images, - polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, empty_marginals, empty_marginals, [], [], [], + polygons_of_marginals, polygons_of_marginals, + empty_marginals, empty_marginals, + empty_marginals, empty_marginals, + [], [], [], cont_page, polygons_seplines, contours_tables) return pcgts @@ -4767,7 +4932,8 @@ class Eynollah: if self.light_version: contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one( - contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, marginal_cnts=polygons_of_marginals) + contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, + marginal_cnts=polygons_of_marginals) #print("text region early 3.5 in %.1fs", time.time() - t0) txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) @@ -4793,19 +4959,26 @@ class Eynollah: polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, boxes_marginals, slope_deskew) - #slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con = \ + #slopes, all_found_textline_polygons, boxes_text, txt_con_org, \ + # contours_only_text_parent, index_by_text_par_con = \ # self.delete_regions_without_textlines(slopes, all_found_textline_polygons, # boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con) - #slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, polygons_of_marginals, polygons_of_marginals, _ = \ + #slopes_marginals, 
all_found_textline_polygons_marginals, boxes_marginals, \ + # polygons_of_marginals, polygons_of_marginals, _ = \ # self.delete_regions_without_textlines(slopes_marginals, all_found_textline_polygons_marginals, - # boxes_marginals, polygons_of_marginals, polygons_of_marginals, np.array(range(len(polygons_of_marginals)))) - all_found_textline_polygons = dilate_textline_contours(all_found_textline_polygons) + # boxes_marginals, polygons_of_marginals, polygons_of_marginals, + # np.array(range(len(polygons_of_marginals)))) + all_found_textline_polygons = dilate_textline_contours( + all_found_textline_polygons) all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( all_found_textline_polygons, None, textline_mask_tot_ea_org, type_contour="textline") - all_found_textline_polygons_marginals = dilate_textline_contours(all_found_textline_polygons_marginals) - contours_only_text_parent, txt_con_org, conf_contours_textregions, all_found_textline_polygons, contours_only_text_parent_d_ordered, \ + all_found_textline_polygons_marginals = dilate_textline_contours( + all_found_textline_polygons_marginals) + contours_only_text_parent, txt_con_org, conf_contours_textregions, \ + all_found_textline_polygons, contours_only_text_parent_d_ordered, \ index_by_text_par_con = self.filter_contours_without_textline_inside( - contours_only_text_parent, txt_con_org, all_found_textline_polygons, contours_only_text_parent_d_ordered, conf_contours_textregions) + contours_only_text_parent, txt_con_org, all_found_textline_polygons, + contours_only_text_parent_d_ordered, conf_contours_textregions) else: textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, \ @@ -4847,7 +5020,13 @@ class Eynollah: all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) mid_point_of_page_width = text_regions_p.shape[1] / 2. 
- polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes_marginals_left, slopes_marginals_right = self.separate_marginals_to_left_and_right_and_order_from_top_to_down(polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width) + (polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes_marginals_left, slopes_marginals_right) = \ + self.separate_marginals_to_left_and_right_and_order_from_top_to_down( + polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, + slopes_marginals, mid_point_of_page_width) #print(len(polygons_of_marginals), len(ordered_left_marginals), len(ordered_right_marginals), 'marginals ordred') if self.full_layout: @@ -4871,40 +5050,41 @@ class Eynollah: all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, \ contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, \ conf_contours_textregions, conf_contours_textregions_h = fun( - text_regions_p, regions_fully, contours_only_text_parent, - all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered, conf_contours_textregions) + text_regions_p, regions_fully, contours_only_text_parent, + all_box_coord, all_found_textline_polygons, + slopes, contours_only_text_parent_d_ordered, conf_contours_textregions) if self.plotter: self.plotter.save_plot_of_layout(text_regions_p, image_page) self.plotter.save_plot_of_layout_all(text_regions_p, image_page) - pixel_img = 4 - polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) + label_img = 4 + polygons_of_drop_capitals = 
return_contours_of_interested_region_by_min_size(text_regions_p, label_img) ##all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( ##text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, ##all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, ##kernel=KERNEL, curved_line=self.curved_line, textline_light=self.textline_light) if not self.reading_order_machine_based: - pixel_seps = 6 + label_seps = 6 if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_seps, contours_only_text_parent_h) + num_col_classifier, self.tables, label_seps, contours_only_text_parent_h) else: _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_seps, contours_only_text_parent_h_d_ordered) + num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_seps) + num_col_classifier, self.tables, label_seps) else: _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_seps) + num_col_classifier, self.tables, label_seps) if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -4949,7 +5129,8 @@ class Eynollah: contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: order_text_new, id_of_texts_tot = 
self.do_order_of_regions( - contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) + contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, + boxes_d, textline_mask_tot_d) self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") if self.ocr and not self.tr: @@ -4962,27 +5143,37 @@ class Eynollah: gc.collect() if len(all_found_textline_polygons)>0: - ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( + image_page, all_found_textline_polygons, self.prediction_model, + self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: - ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( + image_page, all_found_textline_polygons_marginals_left, self.prediction_model, + self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_marginals_left = None if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: - ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( + image_page, all_found_textline_polygons_marginals_right, self.prediction_model, + self.b_s_ocr, self.num_to_char, 
self.textline_light, self.curved_line) else: ocr_all_textlines_marginals_right = None if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: - ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_h, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines( + image_page, all_found_textline_polygons_h, self.prediction_model, + self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_h = None if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: - ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines(image_page, polygons_of_drop_capitals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines( + image_page, polygons_of_drop_capitals, self.prediction_model, + self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_drop = None else: @@ -4997,9 +5188,15 @@ class Eynollah: pcgts = self.writer.build_pagexml_full_layout( contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, - polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals_left, polygons_of_marginals_right, - all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, ocr_all_textlines, ocr_all_textlines_h, ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, ocr_all_textlines_drop, conf_contours_textregions, conf_contours_textregions_h) + polygons_of_images, contours_tables, 
polygons_of_drop_capitals, + polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, ocr_all_textlines, ocr_all_textlines_h, + ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, + ocr_all_textlines_drop, + conf_contours_textregions, conf_contours_textregions_h) return pcgts @@ -5034,18 +5231,14 @@ class Eynollah: if self.ocr and self.tr: self.logger.info("Step 4.5/5: OCR Processing") - if torch.cuda.is_available(): self.logger.info("Using GPU acceleration") else: self.logger.info("Using CPU processing") - if self.light_version: self.logger.info("Using light version OCR") - if self.textline_light: self.logger.info("Using light text line detection for OCR") - self.logger.info("Processing text lines...") device = cuda.get_current_device() @@ -5090,7 +5283,8 @@ class Eynollah: img_croped = img_poly_on_img[y:y+h, x:x+w, :] #cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) - text_ocr = self.return_ocr_of_textline_without_common_section(img_croped, model_ocr, processor, device, w, h2w_ratio, ind_tot) + text_ocr = self.return_ocr_of_textline_without_common_section( + img_croped, model_ocr, processor, device, w, h2w_ratio, ind_tot) ocr_textline_in_textregion.append(text_ocr) ind_tot = ind_tot +1 ocr_all_textlines.append(ocr_textline_in_textregion) @@ -5098,13 +5292,19 @@ class Eynollah: elif self.ocr and not self.tr: gc.collect() if len(all_found_textline_polygons)>0: - ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( + image_page, all_found_textline_polygons, self.prediction_model, + 
self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: - ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( + image_page, all_found_textline_polygons_marginals_left, self.prediction_model, + self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: - ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( + image_page, all_found_textline_polygons_marginals_right, self.prediction_model, + self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None @@ -5117,9 +5317,14 @@ class Eynollah: pcgts = self.writer.build_pagexml_no_full_layout( txt_con_org, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, - all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, contours_tables, ocr_all_textlines, ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, conf_contours_textregions) + all_found_textline_polygons, all_box_coord, polygons_of_images, + polygons_of_marginals_left, polygons_of_marginals_right, + 
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, contours_tables, ocr_all_textlines, + ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, + conf_contours_textregions) return pcgts @@ -5138,7 +5343,6 @@ class Eynollah_ocr: min_conf_value_of_textline_text : Optional[float]=None, logger=None, ): - self.dir_models = dir_models self.model_name = model_name self.tr_ocr = tr_ocr self.export_textline_images_and_text = export_textline_images_and_text @@ -5261,7 +5465,9 @@ class Eynollah_ocr: if child_textlines.tag.endswith("Coords"): cropped_lines_region_indexer.append(indexer_text_region) p_h=child_textlines.attrib['points'].split(' ') - textline_coords = np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) + textline_coords = np.array( [ [int(x.split(',')[0]), + int(x.split(',')[1]) ] + for x in p_h] ) x,y,w,h = cv2.boundingRect(textline_coords) if dir_out_image_text: @@ -5277,9 +5483,12 @@ class Eynollah_ocr: img_crop = img_poly_on_img[y:y+h, x:x+w, :] img_crop[mask_poly==0] = 255 - self.logger.debug("processing %d lines for '%s'", len(cropped_lines), nn.attrib['id']) + self.logger.debug("processing %d lines for '%s'", + len(cropped_lines), nn.attrib['id']) if h2w_ratio > 0.1: - cropped_lines.append(resize_image(img_crop, tr_ocr_input_height_and_width, tr_ocr_input_height_and_width) ) + cropped_lines.append(resize_image(img_crop, + tr_ocr_input_height_and_width, + tr_ocr_input_height_and_width) ) cropped_lines_meging_indexing.append(0) indexer_b_s+=1 if indexer_b_s==self.b_s: @@ -5288,8 +5497,10 @@ class Eynollah_ocr: indexer_b_s = 0 pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) - generated_text_merged = 
self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + generated_ids_merged = self.model_ocr.generate( + pixel_values_merged.to(self.device)) + generated_text_merged = self.processor.batch_decode( + generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged @@ -5297,7 +5508,9 @@ class Eynollah_ocr: splited_images, _ = return_textlines_split_if_needed(img_crop, None) #print(splited_images) if splited_images: - cropped_lines.append(resize_image(splited_images[0], tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) + cropped_lines.append(resize_image(splited_images[0], + tr_ocr_input_height_and_width, + tr_ocr_input_height_and_width)) cropped_lines_meging_indexing.append(1) indexer_b_s+=1 @@ -5307,13 +5520,17 @@ class Eynollah_ocr: indexer_b_s = 0 pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) - generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + generated_ids_merged = self.model_ocr.generate( + pixel_values_merged.to(self.device)) + generated_text_merged = self.processor.batch_decode( + generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged - cropped_lines.append(resize_image(splited_images[1], tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) + cropped_lines.append(resize_image(splited_images[1], + tr_ocr_input_height_and_width, + tr_ocr_input_height_and_width)) cropped_lines_meging_indexing.append(-1) indexer_b_s+=1 @@ -5323,8 +5540,10 @@ class Eynollah_ocr: indexer_b_s = 0 pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) - generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + generated_ids_merged = 
self.model_ocr.generate( + pixel_values_merged.to(self.device)) + generated_text_merged = self.processor.batch_decode( + generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged @@ -5339,8 +5558,10 @@ class Eynollah_ocr: indexer_b_s = 0 pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) - generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + generated_ids_merged = self.model_ocr.generate( + pixel_values_merged.to(self.device)) + generated_text_merged = self.processor.batch_decode( + generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged @@ -5371,15 +5592,22 @@ class Eynollah_ocr: ####n_end = (i+1)*self.b_s ####imgs = cropped_lines[n_start:n_end] ####pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values - ####generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) - ####generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + ####generated_ids_merged = self.model_ocr.generate( + #### pixel_values_merged.to(self.device)) + ####generated_text_merged = self.processor.batch_decode( + #### generated_ids_merged, skip_special_tokens=True) ####extracted_texts = extracted_texts + generated_text_merged del cropped_lines gc.collect() - extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_texts_merged = [extracted_texts[ind] + if cropped_lines_meging_indexing[ind]==0 + else extracted_texts[ind]+" "+extracted_texts[ind+1] + if cropped_lines_meging_indexing[ind]==1 + else None + for ind in 
range(len(cropped_lines_meging_indexing))] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] #print(extracted_texts_merged, len(extracted_texts_merged)) @@ -5401,7 +5629,8 @@ class Eynollah_ocr: w_bb = bb_ind[2] h_bb = bb_ind[3] - font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], font.path, w_bb, int(h_bb*0.4) ) + font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], + font.path, w_bb, int(h_bb*0.4) ) ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) @@ -5419,25 +5648,27 @@ class Eynollah_ocr: #print(len(unique_cropped_lines_region_indexer), 'unique_cropped_lines_region_indexer') #######text_by_textregion = [] #######for ind in unique_cropped_lines_region_indexer: - #######extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] - + #######ind = np.array(cropped_lines_region_indexer)==ind + #######extracted_texts_merged_un = np.array(extracted_texts_merged)[ind] #######text_by_textregion.append(" ".join(extracted_texts_merged_un)) text_by_textregion = [] for ind in unique_cropped_lines_region_indexer: - extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] + ind = np.array(cropped_lines_region_indexer) == ind + extracted_texts_merged_un = np.array(extracted_texts_merged)[ind] if len(extracted_texts_merged_un)>1: text_by_textregion_ind = "" next_glue = "" for indt in range(len(extracted_texts_merged_un)): - if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-') or extracted_texts_merged_un[indt].endswith('¬'): - text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1] + if (extracted_texts_merged_un[indt].endswith('⸗') or + extracted_texts_merged_un[indt].endswith('-') or + extracted_texts_merged_un[indt].endswith('¬')): + text_by_textregion_ind += next_glue + 
extracted_texts_merged_un[indt][:-1] next_glue = "" else: - text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt] + text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt] next_glue = " " text_by_textregion.append(text_by_textregion_ind) - else: text_by_textregion.append(" ".join(extracted_texts_merged_un)) @@ -5495,7 +5726,9 @@ class Eynollah_ocr: unicode_textregion.text = text_by_textregion[indexer_textregion] indexer_textregion = indexer_textregion + 1 - ###sample_order = [(id_to_order[tid], text) for tid, text in zip(id_textregions, textregions_by_existing_ids) if tid in id_to_order] + ###sample_order = [(id_to_order[tid], text) + ### for tid, text in zip(id_textregions, textregions_by_existing_ids) + ### if tid in id_to_order] ##ordered_texts_sample = [text for _, text in sorted(sample_order)] ##tot_page_text = ' '.join(ordered_texts_sample) @@ -5569,7 +5802,9 @@ class Eynollah_ocr: if child_textlines.tag.endswith("Coords"): cropped_lines_region_indexer.append(indexer_text_region) p_h=child_textlines.attrib['points'].split(' ') - textline_coords = np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) + textline_coords = np.array( [ [int(x.split(',')[0]), + int(x.split(',')[1]) ] + for x in p_h] ) x,y,w,h = cv2.boundingRect(textline_coords) @@ -5601,17 +5836,19 @@ class Eynollah_ocr: img_crop[mask_poly==0] = 255 else: - #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') + # print(file_name, angle_degrees, w*h, + # mask_poly[:,:,0].sum(), + # mask_poly[:,:,0].sum() /float(w*h) , + # 'didi') if angle_degrees > 3: better_des_slope = get_orientation_moments(textline_coords) - img_crop = rotate_image_with_padding(img_crop, better_des_slope ) - + img_crop = rotate_image_with_padding(img_crop, better_des_slope) if dir_in_bin is not None: - img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope ) + img_crop_bin = 
rotate_image_with_padding(img_crop_bin, better_des_slope) - mask_poly = rotate_image_with_padding(mask_poly, better_des_slope ) + mask_poly = rotate_image_with_padding(mask_poly, better_des_slope) mask_poly = mask_poly.astype('uint8') #new bounding box @@ -5622,7 +5859,6 @@ class Eynollah_ocr: if not self.do_not_mask_with_textline_contour: img_crop[mask_poly==0] = 255 - if dir_in_bin is not None: img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] if not self.do_not_mask_with_textline_contour: @@ -5630,11 +5866,14 @@ class Eynollah_ocr: if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90: if dir_in_bin is not None: - img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + img_crop, img_crop_bin = \ + break_curved_line_into_small_pieces_and_then_merge( + img_crop, mask_poly, img_crop_bin) else: - img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + img_crop, _ = \ + break_curved_line_into_small_pieces_and_then_merge( + img_crop, mask_poly) - else: better_des_slope = 0 if not self.do_not_mask_with_textline_contour: @@ -5647,13 +5886,18 @@ class Eynollah_ocr: else: if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90: if dir_in_bin is not None: - img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + img_crop, img_crop_bin = \ + break_curved_line_into_small_pieces_and_then_merge( + img_crop, mask_poly, img_crop_bin) else: - img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + img_crop, _ = \ + break_curved_line_into_small_pieces_and_then_merge( + img_crop, mask_poly) if not self.export_textline_images_and_text: if w_scaled < 750:#1.5*image_width: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model( + img_crop, image_height, image_width) 
cropped_lines.append(img_fin) if abs(better_des_slope) > 45: cropped_lines_ver_index.append(1) @@ -5662,13 +5906,15 @@ class Eynollah_ocr: cropped_lines_meging_indexing.append(0) if dir_in_bin is not None: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model( + img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) else: splited_images, splited_images_bin = return_textlines_split_if_needed( img_crop, img_crop_bin if dir_in_bin is not None else None) if splited_images: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model( + splited_images[0], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(1) @@ -5677,7 +5923,8 @@ class Eynollah_ocr: else: cropped_lines_ver_index.append(0) - img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model( + splited_images[1], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(-1) @@ -5688,13 +5935,16 @@ class Eynollah_ocr: cropped_lines_ver_index.append(0) if dir_in_bin is not None: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model( + splited_images_bin[0], image_height, image_width) cropped_lines_bin.append(img_fin) - img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model( + splited_images_bin[1], image_height, image_width) cropped_lines_bin.append(img_fin) else: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + img_fin = 
preprocess_and_resize_image_for_ocrcnn_model( + img_crop, image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(0) @@ -5704,7 +5954,8 @@ class Eynollah_ocr: cropped_lines_ver_index.append(0) if dir_in_bin is not None: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model( + img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) if self.export_textline_images_and_text: @@ -5716,7 +5967,8 @@ class Eynollah_ocr: if cheild_text.tag.endswith("Unicode"): textline_text = cheild_text.text if textline_text: - base_name = os.path.join(dir_out, file_name + '_line_' + str(indexer_textlines)) + base_name = os.path.join( + dir_out, file_name + '_line_' + str(indexer_textlines)) if self.pref_of_dataset: base_name += '_' + self.pref_of_dataset if not self.do_not_mask_with_textline_contour: @@ -5806,25 +6058,31 @@ class Eynollah_ocr: preds_max_fliped = np.max(preds_flipped, axis=2 ) preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character - masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) + masked_means_flipped = \ + np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \ + np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) masked_means_flipped[np.isnan(masked_means_flipped)] = 0 preds_max = np.max(preds, axis=2 ) preds_max_args = np.argmax(preds, axis=2 ) pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character - masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means = \ + np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \ + np.sum(pred_max_not_unk_mask_bool, axis=1) masked_means[np.isnan(masked_means)] = 0 masked_means_ver = 
masked_means[indices_ver] #print(masked_means_ver, 'pred_max_not_unk') - indices_where_flipped_conf_value_is_higher = np.where(masked_means_flipped > masked_means_ver)[0] + indices_where_flipped_conf_value_is_higher = \ + np.where(masked_means_flipped > masked_means_ver)[0] #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher') if len(indices_where_flipped_conf_value_is_higher)>0: indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher] - preds[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] + preds[indices_to_be_replaced,:,:] = \ + preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] if dir_in_bin is not None: preds_bin = self.prediction_model.predict(imgs_bin, verbose=0) @@ -5833,35 +6091,42 @@ class Eynollah_ocr: preds_max_fliped = np.max(preds_flipped, axis=2 ) preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character - masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) + masked_means_flipped = \ + np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \ + np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) masked_means_flipped[np.isnan(masked_means_flipped)] = 0 preds_max = np.max(preds, axis=2 ) preds_max_args = np.argmax(preds, axis=2 ) pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character - masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means = \ + np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \ + np.sum(pred_max_not_unk_mask_bool, axis=1) masked_means[np.isnan(masked_means)] = 0 masked_means_ver = masked_means[indices_ver] #print(masked_means_ver, 'pred_max_not_unk') - indices_where_flipped_conf_value_is_higher = 
np.where(masked_means_flipped > masked_means_ver)[0] + indices_where_flipped_conf_value_is_higher = \ + np.where(masked_means_flipped > masked_means_ver)[0] #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher') if len(indices_where_flipped_conf_value_is_higher)>0: indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher] - preds_bin[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] + preds_bin[indices_to_be_replaced,:,:] = \ + preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] preds = (preds + preds_bin) / 2. - pred_texts = decode_batch_predictions(preds, self.num_to_char) preds_max = np.max(preds, axis=2 ) preds_max_args = np.argmax(preds, axis=2 ) pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character - masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means = \ + np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \ + np.sum(pred_max_not_unk_mask_bool, axis=1) for ib in range(imgs.shape[0]): pred_texts_ib = pred_texts[ib].replace("[UNK]", "") @@ -5876,31 +6141,40 @@ class Eynollah_ocr: del cropped_lines_bin gc.collect() - extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_texts_merged = [extracted_texts[ind] + if cropped_lines_meging_indexing[ind]==0 + else extracted_texts[ind]+" "+extracted_texts[ind+1] + if cropped_lines_meging_indexing[ind]==1 + else None + for ind in range(len(cropped_lines_meging_indexing))] - extracted_conf_value_merged = [extracted_conf_value[ind] if cropped_lines_meging_indexing[ind]==0 else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2. 
if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_conf_value_merged = [extracted_conf_value[ind] + if cropped_lines_meging_indexing[ind]==0 + else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2. + if cropped_lines_meging_indexing[ind]==1 + else None + for ind in range(len(cropped_lines_meging_indexing))] - extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm] for ind_cfm in range(len(extracted_texts_merged)) if extracted_texts_merged[ind_cfm] is not None] + extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm] + for ind_cfm in range(len(extracted_texts_merged)) + if extracted_texts_merged[ind_cfm] is not None] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) - if dir_out_image_text: - #font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! font = importlib_resources.files(__package__) / "Charis-Regular.ttf" with importlib_resources.as_file(font) as font: font = ImageFont.truetype(font=font, size=40) for indexer_text, bb_ind in enumerate(total_bb_coordinates): - - x_bb = bb_ind[0] y_bb = bb_ind[1] w_bb = bb_ind[2] h_bb = bb_ind[3] - font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], font.path, w_bb, int(h_bb*0.4) ) + font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], + font.path, w_bb, int(h_bb*0.4) ) ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) @@ -5917,24 +6191,25 @@ class Eynollah_ocr: text_by_textregion = [] for ind in unique_cropped_lines_region_indexer: - extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] + ind = np.array(cropped_lines_region_indexer)==ind + extracted_texts_merged_un = np.array(extracted_texts_merged)[ind] if len(extracted_texts_merged_un)>1: text_by_textregion_ind = "" 
next_glue = "" for indt in range(len(extracted_texts_merged_un)): - if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-') or extracted_texts_merged_un[indt].endswith('¬'): - text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1] + if (extracted_texts_merged_un[indt].endswith('⸗') or + extracted_texts_merged_un[indt].endswith('-') or + extracted_texts_merged_un[indt].endswith('¬')): + text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt][:-1] next_glue = "" else: - text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt] + text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt] next_glue = " " text_by_textregion.append(text_by_textregion_ind) - else: text_by_textregion.append(" ".join(extracted_texts_merged_un)) #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion') - - + ###index_tot_regions = [] ###tot_region_ref = [] @@ -5983,7 +6258,8 @@ class Eynollah_ocr: if childtest3.tag.endswith("TextEquiv"): for child_uc in childtest3: if child_uc.tag.endswith("Unicode"): - childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") + childtest3.set('conf', + f"{extracted_conf_value_merged[indexer]:.2f}") child_uc.text = extracted_texts_merged[indexer] indexer = indexer + 1 @@ -5999,7 +6275,9 @@ class Eynollah_ocr: unicode_textregion.text = text_by_textregion[indexer_textregion] indexer_textregion = indexer_textregion + 1 - ###sample_order = [(id_to_order[tid], text) for tid, text in zip(id_textregions, textregions_by_existing_ids) if tid in id_to_order] + ###sample_order = [(id_to_order[tid], text) + ### for tid, text in zip(id_textregions, textregions_by_existing_ids) + ### if tid in id_to_order] ##ordered_texts_sample = [text for _, text in sorted(sample_order)] ##tot_page_text = ' '.join(ordered_texts_sample) diff --git 
a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f8926cf..52bf3ef 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1012,8 +1012,13 @@ def check_any_text_region_in_model_one_is_main_or_header_light( (regions_model_full[:,:,0]==2)).sum() pixels_main = all_pixels - pixels_header - if ( (pixels_header/float(pixels_main)>=0.6) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ) and ( (length_con[ii]/float(height_con[ii]) )<=3 )) or ( (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=3 ) ): - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2 + if (( pixels_header / float(pixels_main) >= 0.6 and + length_con[ii] / float(height_con[ii]) >= 1.3 and + length_con[ii] / float(height_con[ii]) <= 3 ) or + ( pixels_header / float(pixels_main) >= 0.3 and + length_con[ii] / float(height_con[ii]) >=3 )): + + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ] = 2 contours_only_text_parent_head.append(contours_only_text_parent[ii]) conf_contours_head.append(None) # why not conf_contours[ii], too? 
if contours_only_text_parent_d_ordered is not None: @@ -1021,8 +1026,9 @@ def check_any_text_region_in_model_one_is_main_or_header_light( all_box_coord_head.append(all_box_coord[ii]) slopes_head.append(slopes[ii]) all_found_textline_polygons_head.append(all_found_textline_polygons[ii]) + else: - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ] = 1 contours_only_text_parent_main.append(contours_only_text_parent[ii]) conf_contours_main.append(conf_contours[ii]) if contours_only_text_parent_d_ordered is not None: @@ -1883,7 +1889,8 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), + y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + + len(x_start_without_mother), dtype=int) * splitter_y_new[i]) ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) @@ -1938,7 +1945,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_with_child_no_mothers.update( range(x_start_with_child_without_mother[dj], x_end_with_child_without_mother[dj])) - columns_not_covered_child_no_mother = list(all_columns - columns_covered_by_with_child_no_mothers) + columns_not_covered_child_no_mother = list( + all_columns - columns_covered_by_with_child_no_mothers) #indexes_to_be_spanned=[] for i_s in range(len(x_end_with_child_without_mother)): columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s]) @@ -1948,7 +1956,8 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) for i_s_nc in 
columns_not_covered_child_no_mother: if i_s_nc in x_start_with_child_without_mother: - x_end_biggest_column = x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] + x_end_biggest_column = \ + x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & (x_ending==x_end_biggest_column)] y_column_nc = y_type_2[args_all_biggest_lines] @@ -1996,9 +2005,12 @@ def return_boxes_of_images_by_order_of_reading_new( np.array(list(set(list(range(len(y_all_between_nm_wc)))) - set(list(index_lines_so_close_to_top_separator)))) if len(indexes_remained_after_deleting_closed_lines) > 0: - y_all_between_nm_wc = y_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] - x_starting_all_between_nm_wc = x_starting_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] - x_ending_all_between_nm_wc = x_ending_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + y_all_between_nm_wc = \ + y_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_starting_all_between_nm_wc = \ + x_starting_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_ending_all_between_nm_wc = \ + x_ending_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 7a8926d..d41dda1 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -67,7 +67,8 @@ def dedup_separate_lines(img_patch, contour_text_interest, thetha, axis): peaks_neg_e, _ = find_peaks(y_padded_up_to_down_padded_e, height=0) neg_peaks_max = np.max(y_padded_up_to_down_padded_e[peaks_neg_e]) - arg_neg_must_be_deleted = np.arange(len(peaks_neg_e))[y_padded_up_to_down_padded_e[peaks_neg_e] / float(neg_peaks_max) < 
0.3] + arg_neg_must_be_deleted = np.arange(len(peaks_neg_e))[ + y_padded_up_to_down_padded_e[peaks_neg_e] / float(neg_peaks_max) < 0.3] diff_arg_neg_must_be_deleted = np.diff(arg_neg_must_be_deleted) arg_diff = np.array(range(len(diff_arg_neg_must_be_deleted))) @@ -78,11 +79,14 @@ def dedup_separate_lines(img_patch, contour_text_interest, thetha, axis): clusters_to_be_deleted = [] if len(arg_diff_cluster) > 0: - clusters_to_be_deleted.append(arg_neg_must_be_deleted[0 : arg_diff_cluster[0] + 1]) + clusters_to_be_deleted.append( + arg_neg_must_be_deleted[0 : arg_diff_cluster[0] + 1]) for i in range(len(arg_diff_cluster) - 1): - clusters_to_be_deleted.append(arg_neg_must_be_deleted[arg_diff_cluster[i] + 1 : - arg_diff_cluster[i + 1] + 1]) - clusters_to_be_deleted.append(arg_neg_must_be_deleted[arg_diff_cluster[len(arg_diff_cluster) - 1] + 1 :]) + clusters_to_be_deleted.append( + arg_neg_must_be_deleted[arg_diff_cluster[i] + 1 : + arg_diff_cluster[i + 1] + 1]) + clusters_to_be_deleted.append( + arg_neg_must_be_deleted[arg_diff_cluster[len(arg_diff_cluster) - 1] + 1 :]) if len(clusters_to_be_deleted) > 0: peaks_new_extra = [] for m in range(len(clusters_to_be_deleted)): @@ -179,7 +183,8 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): peaks_neg_e, _ = find_peaks(y_padded_up_to_down_padded_e, height=0) neg_peaks_max=np.max(y_padded_up_to_down_padded_e[peaks_neg_e]) - arg_neg_must_be_deleted= np.arange(len(peaks_neg_e))[y_padded_up_to_down_padded_e[peaks_neg_e]/float(neg_peaks_max)<0.3] + arg_neg_must_be_deleted = np.arange(len(peaks_neg_e))[ + y_padded_up_to_down_padded_e[peaks_neg_e]/float(neg_peaks_max)<0.3] diff_arg_neg_must_be_deleted=np.diff(arg_neg_must_be_deleted) arg_diff=np.array(range(len(diff_arg_neg_must_be_deleted))) @@ -239,7 +244,8 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): try: neg_peaks_max=np.max(y_padded_smoothed[peaks]) - arg_neg_must_be_deleted= 
np.arange(len(peaks_neg))[y_padded_up_to_down_padded[peaks_neg]/float(neg_peaks_max)<0.42] + arg_neg_must_be_deleted = np.arange(len(peaks_neg))[ + y_padded_up_to_down_padded[peaks_neg]/float(neg_peaks_max)<0.42] diff_arg_neg_must_be_deleted=np.diff(arg_neg_must_be_deleted) arg_diff=np.array(range(len(diff_arg_neg_must_be_deleted))) @@ -316,23 +322,36 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): if peaks_values[jj]>mean_value_of_peaks-std_value_of_peaks/2.: point_up = peaks[jj] + first_nonzero - int(1.3 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down =y_max_cont-1##peaks[jj] + first_nonzero + int(1.3 * dis_to_next_down) #point_up# np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) + point_down =y_max_cont-1 + ##peaks[jj] + first_nonzero + int(1.3 * dis_to_next_down) + #point_up + # np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) + ###-int(dis_to_next_down*1./4.0) else: point_up = peaks[jj] + first_nonzero - int(1.4 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down =y_max_cont-1##peaks[jj] + first_nonzero + int(1.6 * dis_to_next_down) #point_up# np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) + point_down =y_max_cont-1 + ##peaks[jj] + first_nonzero + int(1.6 * dis_to_next_down) + #point_up + # np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) + ###-int(dis_to_next_down*1./4.0) point_down_narrow = peaks[jj] + first_nonzero + int( - 1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./2) + 1.4 * dis_to_next_down) + ###-int(dis_to_next_down*1./2) else: dis_to_next_up = abs(peaks[jj] - peaks_neg[jj]) dis_to_next_down = abs(peaks[jj] - peaks_neg[jj + 1]) if peaks_values[jj]>mean_value_of_peaks-std_value_of_peaks/2.: - point_up = peaks[jj] + first_nonzero - int(1.1 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down = peaks[jj] + first_nonzero + int(1.1 * 
dis_to_next_down) ###-int(dis_to_next_down*1./4.0) + point_up = peaks[jj] + first_nonzero - int(1.1 * dis_to_next_up) + ##+int(dis_to_next_up*1./4.0) + point_down = peaks[jj] + first_nonzero + int(1.1 * dis_to_next_down) + ###-int(dis_to_next_down*1./4.0) else: - point_up = peaks[jj] + first_nonzero - int(1.23 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down = peaks[jj] + first_nonzero + int(1.33 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) + point_up = peaks[jj] + first_nonzero - int(1.23 * dis_to_next_up) + ##+int(dis_to_next_up*1./4.0) + point_down = peaks[jj] + first_nonzero + int(1.33 * dis_to_next_down) + ###-int(dis_to_next_down*1./4.0) point_down_narrow = peaks[jj] + first_nonzero + int( 1.1 * dis_to_next_down) ###-int(dis_to_next_down*1./2) @@ -341,7 +360,9 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_narrow = img_patch.shape[0] - 2 - distances = [cv2.pointPolygonTest(contour_text_interest_copy, tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), True) + distances = [cv2.pointPolygonTest(contour_text_interest_copy, + tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), + True) for mj in range(len(xv))] distances = np.array(distances) @@ -468,7 +489,8 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_up =peaks[jj] + first_nonzero - int(1. / 1.8 * dis_to_next) distances = [cv2.pointPolygonTest(contour_text_interest_copy, - tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), True) + tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), + True) for mj in range(len(xv))] distances = np.array(distances) @@ -543,7 +565,8 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down = peaks[jj] + first_nonzero + int(1. 
/ 1.9 * dis_to_next_down) distances = [cv2.pointPolygonTest(contour_text_interest_copy, - tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), True) + tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), + True) for mj in range(len(xv))] distances = np.array(distances) @@ -613,7 +636,8 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): neg_peaks_max = np.max(y_padded_up_to_down_padded[peaks_neg]) - arg_neg_must_be_deleted = np.arange(len(peaks_neg))[y_padded_up_to_down_padded[peaks_neg] / float(neg_peaks_max) < 0.42] + arg_neg_must_be_deleted = np.arange(len(peaks_neg))[ + y_padded_up_to_down_padded[peaks_neg] / float(neg_peaks_max) < 0.42] diff_arg_neg_must_be_deleted = np.diff(arg_neg_must_be_deleted) arg_diff = np.array(range(len(diff_arg_neg_must_be_deleted))) @@ -689,30 +713,50 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): dis_to_next_down = abs(peaks[jj] - peaks_neg[jj + 1]) if peaks_values[jj] > mean_value_of_peaks - std_value_of_peaks / 2.0: - point_up = peaks[jj] + first_nonzero - int(1.3 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down = x_max_cont - 1 ##peaks[jj] + first_nonzero + int(1.3 * dis_to_next_down) #point_up# np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) + point_up = peaks[jj] + first_nonzero - int(1.3 * dis_to_next_up) + ##+int(dis_to_next_up*1./4.0) + point_down = x_max_cont - 1 + ##peaks[jj] + first_nonzero + int(1.3 * dis_to_next_down) + #point_up + # np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) + ###-int(dis_to_next_down*1./4.0) else: - point_up = peaks[jj] + first_nonzero - int(1.4 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down = x_max_cont - 1 ##peaks[jj] + first_nonzero + int(1.6 * dis_to_next_down) #point_up# np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) + point_up = peaks[jj] + 
first_nonzero - int(1.4 * dis_to_next_up) + ##+int(dis_to_next_up*1./4.0) + point_down = x_max_cont - 1 + ##peaks[jj] + first_nonzero + int(1.6 * dis_to_next_down) + #point_up + # np.max(y_cont) + #peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) + ###-int(dis_to_next_down*1./4.0) - point_down_narrow = peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./2) + point_down_narrow = peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) + ###-int(dis_to_next_down*1./2) else: dis_to_next_up = abs(peaks[jj] - peaks_neg[jj]) dis_to_next_down = abs(peaks[jj] - peaks_neg[jj + 1]) if peaks_values[jj] > mean_value_of_peaks - std_value_of_peaks / 2.0: - point_up = peaks[jj] + first_nonzero - int(1.1 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down = peaks[jj] + first_nonzero + int(1.1 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) + point_up = peaks[jj] + first_nonzero - int(1.1 * dis_to_next_up) + ##+int(dis_to_next_up*1./4.0) + point_down = peaks[jj] + first_nonzero + int(1.1 * dis_to_next_down) + ###-int(dis_to_next_down*1./4.0) else: - point_up = peaks[jj] + first_nonzero - int(1.23 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down = peaks[jj] + first_nonzero + int(1.33 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) + point_up = peaks[jj] + first_nonzero - int(1.23 * dis_to_next_up) + ##+int(dis_to_next_up*1./4.0) + point_down = peaks[jj] + first_nonzero + int(1.33 * dis_to_next_down) + ###-int(dis_to_next_down*1./4.0) - point_down_narrow = peaks[jj] + first_nonzero + int(1.1 * dis_to_next_down) ###-int(dis_to_next_down*1./2) + point_down_narrow = peaks[jj] + first_nonzero + int(1.1 * dis_to_next_down) + ###-int(dis_to_next_down*1./2) if point_down_narrow >= img_patch.shape[0]: point_down_narrow = img_patch.shape[0] - 2 - distances = [cv2.pointPolygonTest(contour_text_interest_copy, tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), True) for mj in range(len(xv))] + distances = 
[cv2.pointPolygonTest(contour_text_interest_copy, + tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), + True) + for mj in range(len(xv))] distances = np.array(distances) xvinside = xv[distances >= 0] @@ -801,7 +845,8 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): point_up = peaks[jj] + first_nonzero - int(1.0 / 1.8 * dis_to_next) distances = [cv2.pointPolygonTest(contour_text_interest_copy, - tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), True) + tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), + True) for mj in range(len(xv))] distances = np.array(distances) @@ -866,7 +911,8 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): point_down = peaks[jj] + first_nonzero + int(1.0 / 1.9 * dis_to_next_down) distances = [cv2.pointPolygonTest(contour_text_interest_copy, - tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), True) + tuple(int(x) for x in np.array([xv[mj], peaks[jj] + first_nonzero])), + True) for mj in range(len(xv))] distances = np.array(distances) @@ -950,7 +996,8 @@ def separate_lines_new_inside_tiles2(img_patch, thetha): peaks_neg_e, _ = find_peaks(y_padded_up_to_down_padded_e, height=0) neg_peaks_max = np.max(y_padded_up_to_down_padded_e[peaks_neg_e]) - arg_neg_must_be_deleted = np.arange(len(peaks_neg_e))[y_padded_up_to_down_padded_e[peaks_neg_e] / float(neg_peaks_max) < 0.3] + arg_neg_must_be_deleted = np.arange(len(peaks_neg_e))[ + y_padded_up_to_down_padded_e[peaks_neg_e] / float(neg_peaks_max) < 0.3] diff_arg_neg_must_be_deleted = np.diff(arg_neg_must_be_deleted) arg_diff = np.array(range(len(diff_arg_neg_must_be_deleted))) @@ -963,8 +1010,11 @@ def separate_lines_new_inside_tiles2(img_patch, thetha): if len(arg_diff_cluster) > 0: clusters_to_be_deleted.append(arg_neg_must_be_deleted[0 : arg_diff_cluster[0] + 1]) for i in range(len(arg_diff_cluster) - 1): - 
clusters_to_be_deleted.append(arg_neg_must_be_deleted[arg_diff_cluster[i] + 1 : arg_diff_cluster[i + 1] + 1]) - clusters_to_be_deleted.append(arg_neg_must_be_deleted[arg_diff_cluster[len(arg_diff_cluster) - 1] + 1 :]) + clusters_to_be_deleted.append( + arg_neg_must_be_deleted[arg_diff_cluster[i] + 1: + arg_diff_cluster[i + 1] + 1]) + clusters_to_be_deleted.append( + arg_neg_must_be_deleted[arg_diff_cluster[len(arg_diff_cluster) - 1] + 1 :]) if len(clusters_to_be_deleted) > 0: peaks_new_extra = [] for m in range(len(clusters_to_be_deleted)): @@ -1014,7 +1064,8 @@ def separate_lines_new_inside_tiles2(img_patch, thetha): try: neg_peaks_max = np.max(y_padded_smoothed[peaks]) - arg_neg_must_be_deleted = np.arange(len(peaks_neg))[y_padded_up_to_down_padded[peaks_neg] / float(neg_peaks_max) < 0.24] + arg_neg_must_be_deleted = np.arange(len(peaks_neg))[ + y_padded_up_to_down_padded[peaks_neg] / float(neg_peaks_max) < 0.24] diff_arg_neg_must_be_deleted = np.diff(arg_neg_must_be_deleted) arg_diff = np.array(range(len(diff_arg_neg_must_be_deleted))) @@ -1290,7 +1341,9 @@ def separate_lines_vertical_cont(img_patch, contour_text_interest, thetha, box_i return None, cont_final -def textline_contours_postprocessing(textline_mask, slope, contour_text_interest, box_ind, add_boxes_coor_into_textlines=False): +def textline_contours_postprocessing(textline_mask, slope, + contour_text_interest, box_ind, + add_boxes_coor_into_textlines=False): textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) * 255 textline_mask = textline_mask.astype(np.uint8) kernel = np.ones((5, 5), np.uint8) @@ -1485,7 +1538,8 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, onset_y=int((img_resized.shape[0]-img_int.shape[0])/2.) 
#img_resized=np.zeros((int( img_int.shape[0]*(1.8) ) , int( img_int.shape[1]*(2.6) ) )) - #img_resized[ int( img_int.shape[0]*(.4)):int( img_int.shape[0]*(.4))+img_int.shape[0] , int( img_int.shape[1]*(.8)):int( img_int.shape[1]*(.8))+img_int.shape[1] ]=img_int[:,:] + #img_resized[ int( img_int.shape[0]*(.4)):int( img_int.shape[0]*(.4))+img_int.shape[0], + # int( img_int.shape[1]*(.8)):int( img_int.shape[1]*(.8))+img_int.shape[1] ]=img_int[:,:] img_resized[ onset_y:onset_y+img_int.shape[0] , onset_x:onset_x+img_int.shape[1] ]=img_int[:,:] if main_page and img_patch_org.shape[1] > img_patch_org.shape[0]: @@ -1689,14 +1743,18 @@ def do_work_of_slopes_new_curved( mask_biggest2 = cv2.dilate(mask_biggest2, KERNEL, iterations=4) pixel_img = 1 - mask_biggest2 = resize_image(mask_biggest2, int(mask_biggest2.shape[0] * scale_par), int(mask_biggest2.shape[1] * scale_par)) + mask_biggest2 = resize_image(mask_biggest2, + int(mask_biggest2.shape[0] * scale_par), + int(mask_biggest2.shape[1] * scale_par)) cnt_textlines_in_image_ind = return_contours_of_interested_textline(mask_biggest2, pixel_img) try: textlines_cnt_per_region.append(cnt_textlines_in_image_ind[0]) except Exception as why: logger.error(why) else: - textlines_cnt_per_region = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contour_par, box_text, True) + textlines_cnt_per_region = textline_contours_postprocessing(all_text_region_raw, + slope_for_all, contour_par, + box_text, True) return textlines_cnt_per_region[::-1], box_text, contour, contour_par, crop_coor, index_r_con, slope diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 5f19387..602ad6e 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -370,7 +370,11 @@ def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind return textline_contour -def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, prediction_model, b_s_ocr, 
num_to_char, textline_light=False, curved_line=False): +def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, + prediction_model, + b_s_ocr, num_to_char, + textline_light=False, + curved_line=False): max_len = 512 padding_token = 299 image_width = 512#max_len * 4 @@ -426,17 +430,23 @@ def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, pr splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) if splited_images: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], + image_height, + image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(1) - img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], + image_height, + image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(-1) else: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, + image_height, + image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(0) @@ -469,7 +479,12 @@ def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, pr pred_texts_ib = pred_texts[ib].replace("[UNK]", "") extracted_texts.append(pred_texts_ib) - extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_texts_merged = [extracted_texts[ind] + if cropped_lines_meging_indexing[ind]==0 + else extracted_texts[ind]+" "+extracted_texts[ind+1] + if cropped_lines_meging_indexing[ind]==1 + else None + for ind in 
range(len(cropped_lines_meging_indexing))] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) From b21051db21cf4c0f0e1bbf288cd4e985cc01cb7f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 19:16:00 +0200 Subject: [PATCH 034/101] ProcessPoolExecutor: shutdown during del() instead of atexit() --- src/eynollah/eynollah.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 2e31433..7a28478 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -260,7 +260,6 @@ class Eynollah: # for parallelization of CPU-intensive tasks: self.executor = ProcessPoolExecutor(max_workers=cpu_count()) - atexit.register(self.executor.shutdown) if threshold_art_class_layout: self.threshold_art_class_layout = float(threshold_art_class_layout) @@ -406,6 +405,26 @@ class Eynollah: self.logger.info(f"Model initialization complete ({time.time() - t_start:.1f}s)") + def __del__(self): + if hasattr(self, 'executor') and getattr(self, 'executor'): + self.executor.shutdown() + for model_name in ['model_page', + 'model_classifier', + 'model_bin', + 'model_enhancement', + 'model_region', + 'model_region_1_2', + 'model_region_p2', + 'model_region_fl_np', + 'model_region_fl', + 'model_textline', + 'model_reading_order', + 'model_table', + 'model_ocr', + 'processor']: + if hasattr(self, model_name) and getattr(self, model_name): + delattr(self, model_name) + def cache_images(self, image_filename=None, image_pil=None, dpi=None): ret = {} t_c0 = time.time() From 375e0263d4188ff5ca43037a6176544009c74e17 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 19:16:50 +0200 Subject: [PATCH 035/101] CNN-RNN OCR model: switch to 20250930 version (compatible with TF 2.12 on CPU as well) --- src/eynollah/eynollah.py | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 7a28478..62ce002 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -327,7 +327,7 @@ class Eynollah: if self.ocr and self.tr: self.model_ocr_dir = dir_models + "/model_eynollah_ocr_trocr_20250919" elif self.ocr and not self.tr: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250904" + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250930" if self.tables: if self.light_version: self.model_table_dir = dir_models + "/modelens_table_0t4_201124" @@ -5392,7 +5392,7 @@ class Eynollah_ocr: if self.model_name: self.model_ocr_dir = self.model_name else: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250904" + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250930" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( From 61b20cc83d153aa0df2f5b75d6059ac80c730b3c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 19:20:35 +0200 Subject: [PATCH 036/101] tests: switch from subtests to parametrize, use --isolate everywhere to free CUDA memory in between --- Makefile | 2 +- requirements-test.txt | 2 +- tests/test_run.py | 202 ++++++++++++++++++++---------------------- 3 files changed, 100 insertions(+), 106 deletions(-) diff --git a/Makefile b/Makefile index a920615..dd95c0a 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v OCR_MODEL := https://zenodo.org/records/17194824/files/models_ocr_v0_5_0.tar.gz?download=1 -PYTEST_ARGS ?= -vv +PYTEST_ARGS ?= -vv --isolate # BEGIN-EVAL makefile-parser --make-help Makefile diff --git a/requirements-test.txt b/requirements-test.txt index cce9428..3ebcf71 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,4 @@ pytest -pytest-subtests +pytest-isolate coverage[toml] black diff --git 
a/tests/test_run.py b/tests/test_run.py index be928a0..59e5099 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -20,23 +20,9 @@ MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_ MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_5_0').resolve())) MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve())) -def test_run_eynollah_layout_filename(tmp_path, subtests, pytestconfig, caplog): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml' - args = [ - '-m', MODELS_LAYOUT, - '-i', str(infile), - '-o', str(outfile.parent), - # subtests write to same location - '--overwrite', - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - def only_eynollah(logrec): - return logrec.name == 'eynollah' - runner = CliRunner() - for options in [ +@pytest.mark.parametrize( + "options", + [ [], # defaults ["--allow_scaling", "--curved-line"], ["--allow_scaling", "--curved-line", "--full-layout"], @@ -47,22 +33,34 @@ def test_run_eynollah_layout_filename(tmp_path, subtests, pytestconfig, caplog): # -eoi ... 
# --do_ocr # --skip_layout_and_reading_order - ]: - with subtests.test(#msg="test CLI", - options=options): - with caplog.filtering(only_eynollah): - result = runner.invoke(layout_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert str(infile) in logmsgs - assert outfile.exists() - tree = page_from_file(str(outfile)).etree - regions = tree.xpath("//page:TextRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - lines = tree.xpath("//page:TextLine", namespaces=NS) - assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line + ], ids=str) +def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') + outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml' + args = [ + '-m', MODELS_LAYOUT, + '-i', str(infile), + '-o', str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + def only_eynollah(logrec): + return logrec.name == 'eynollah' + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(layout_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert str(infile) in logmsgs + assert outfile.exists() + tree = page_from_file(str(outfile)).etree + regions = tree.xpath("//page:TextRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + lines = tree.xpath("//page:TextLine", namespaces=NS) + assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page 
and 1 catch-word line def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') @@ -86,7 +84,13 @@ def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in')) assert len(list(outdir.iterdir())) == 2 -def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, caplog): +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["--no-patches"], + ], ids=str) +def test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') args = [ @@ -100,25 +104,19 @@ def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, ca def only_eynollah(logrec): return logrec.name == 'SbbBinarizer' runner = CliRunner() - for options in [ - [], # defaults - ["--no-patches"], - ]: - with subtests.test(#msg="test CLI", - options=options): - with caplog.filtering(only_eynollah): - result = runner.invoke(binarization_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting')) - assert outfile.exists() - with Image.open(infile) as original_img: - original_size = original_img.size - with Image.open(outfile) as binarized_img: - binarized_size = binarized_img.size - assert original_size == binarized_size + with caplog.filtering(only_eynollah): + result = runner.invoke(binarization_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting')) + assert outfile.exists() + with Image.open(infile) as original_img: + original_size = 
original_img.size + with Image.open(outfile) as binarized_img: + binarized_size = binarized_img.size + assert original_size == binarized_size -def test_run_eynollah_binarization_directory(tmp_path, subtests, pytestconfig, caplog): +def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ @@ -139,15 +137,19 @@ def test_run_eynollah_binarization_directory(tmp_path, subtests, pytestconfig, c assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2 assert len(list(outdir.iterdir())) == 2 -def test_run_eynollah_enhancement_filename(tmp_path, subtests, pytestconfig, caplog): +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["-sos"], + ], ids=str) +def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') args = [ '-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent), - # subtests write to same location - '--overwrite', ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) @@ -155,25 +157,19 @@ def test_run_eynollah_enhancement_filename(tmp_path, subtests, pytestconfig, cap def only_eynollah(logrec): return logrec.name == 'enhancement' runner = CliRunner() - for options in [ - [], # defaults - ["-sos"], - ]: - with subtests.test(#msg="test CLI", - options=options): - with caplog.filtering(only_eynollah): - result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs - assert outfile.exists() - with Image.open(infile) as original_img: - original_size = original_img.size - with Image.open(outfile) as enhanced_img: - enhanced_size = enhanced_img.size - 
assert (original_size == enhanced_size) == ("-sos" in options) + with caplog.filtering(only_eynollah): + result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs + assert outfile.exists() + with Image.open(infile) as original_img: + original_size = original_img.size + with Image.open(outfile) as enhanced_img: + enhanced_size = enhanced_img.size + assert (original_size == enhanced_size) == ("-sos" in options) -def test_run_eynollah_enhancement_directory(tmp_path, subtests, pytestconfig, caplog): +def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ @@ -194,7 +190,7 @@ def test_run_eynollah_enhancement_directory(tmp_path, subtests, pytestconfig, ca assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2 assert len(list(outdir.iterdir())) == 2 -def test_run_eynollah_mbreorder_filename(tmp_path, subtests, pytestconfig, caplog): +def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') args = [ @@ -223,7 +219,7 @@ def test_run_eynollah_mbreorder_filename(tmp_path, subtests, pytestconfig, caplo #assert in_order != out_order assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3'] -def test_run_eynollah_mbreorder_directory(tmp_path, subtests, pytestconfig, caplog): +def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ @@ -245,7 +241,15 @@ def test_run_eynollah_mbreorder_directory(tmp_path, subtests, pytestconfig, capl #assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2 assert 
len(list(outdir.iterdir())) == 2 -def test_run_eynollah_ocr_filename(tmp_path, subtests, pytestconfig, caplog): +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["-doit", #str(outrenderfile.parent)], + ], + ["-trocr"], + ], ids=str) +def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png') @@ -255,8 +259,6 @@ def test_run_eynollah_ocr_filename(tmp_path, subtests, pytestconfig, caplog): '-i', str(infile), '-dx', str(infile.parent), '-o', str(outfile.parent), - # subtests write to same location - '--overwrite', ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) @@ -264,33 +266,25 @@ def test_run_eynollah_ocr_filename(tmp_path, subtests, pytestconfig, caplog): def only_eynollah(logrec): return logrec.name == 'eynollah' runner = CliRunner() - for options in [ - # kba Fri Sep 26 12:53:49 CEST 2025 - # Disabled until NHWC/NCHW error in https://github.com/qurator-spk/eynollah/actions/runs/18019655200/job/51273541895 debugged - # [], # defaults - # ["-doit", str(outrenderfile.parent)], - ["-trocr"], - ]: - with subtests.test(#msg="test CLI", - options=options): - with caplog.filtering(only_eynollah): - result = runner.invoke(ocr_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: ocr has no logging! 
- #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs - assert outfile.exists() - if "-doit" in options: - assert outrenderfile.exists() - #in_tree = page_from_file(str(infile)).etree - #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) - out_tree = page_from_file(str(outfile)).etree - out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS) - assert len(out_texts) >= 2, ("result is inaccurate", out_texts) - assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts) + if "-doit" in options: + options.insert(options.index("-doit") + 1, str(outrenderfile.parent)) + with caplog.filtering(only_eynollah): + result = runner.invoke(ocr_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: ocr has no logging! + #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs + assert outfile.exists() + if "-doit" in options: + assert outrenderfile.exists() + #in_tree = page_from_file(str(infile)).etree + #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + out_tree = page_from_file(str(outfile)).etree + out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS) + assert len(out_texts) >= 2, ("result is inaccurate", out_texts) + assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts) -@pytest.mark.skip("Disabled until NHWC/NCHW error in https://github.com/qurator-spk/eynollah/actions/runs/18019655200/job/51273541895 debugged") -def test_run_eynollah_ocr_directory(tmp_path, subtests, pytestconfig, caplog): +def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ From a3d8197930b9e2c07862186d23ee192dc0347ff4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 
21:50:21 +0200 Subject: [PATCH 037/101] makefile: update model URL --- Makefile | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index dd95c0a..357aa47 100644 --- a/Makefile +++ b/Makefile @@ -13,10 +13,16 @@ DOCKER ?= docker #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1 +SEG_MODELFILE = $(notdir $(patsubst %?download=1,%,$(SEG_MODEL))) +SEG_MODELNAME = $(SEG_MODELFILE:%.tar.gz=%) BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip +BIN_MODELFILE = $(notdir $(BIN_MODEL)) +BIN_MODELNAME := default-2021-03-09 -OCR_MODEL := https://zenodo.org/records/17194824/files/models_ocr_v0_5_0.tar.gz?download=1 +OCR_MODEL := https://zenodo.org/records/17236998/files/models_ocr_v0_5_1.tar.gz?download=1 +OCR_MODELFILE = $(notdir $(patsubst %?download=1,%,$(OCR_MODEL))) +OCR_MODELNAME = $(OCR_MODELFILE:%.tar.gz=%) PYTEST_ARGS ?= -vv --isolate @@ -31,7 +37,8 @@ help: @echo " install Install package with pip" @echo " install-dev Install editable with pip" @echo " deps-test Install test dependencies with pip" - @echo " models Download and extract models to $(CURDIR)/models_layout_v0_5_0" + @echo " models Download and extract models to $(CURDIR):" + @echo " $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)" @echo " smoke-test Run simple CLI check" @echo " ocrd-test Run OCR-D CLI check" @echo " test Run unit tests" @@ -42,33 +49,29 @@ help: @echo " PYTEST_ARGS pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. 
[$(PYTEST_ARGS)]" @echo " SEG_MODEL URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]" @echo " BIN_MODEL URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]" + @echo " OCR_MODEL URL of 'models' archive to download for binarization 'test' [$(OCR_MODEL)]" @echo "" # END-EVAL # Download and extract models to $(PWD)/models_layout_v0_5_0 -models: models_layout_v0_5_0 models_ocr_v0_5_0 default-2021-03-09 +models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME) -models_layout_v0_5_0: models_layout_v0_5_0.tar.gz - tar zxf models_layout_v0_5_0.tar.gz - -models_layout_v0_5_0.tar.gz: +$(BIN_MODELFILE): + wget -O $@ $(BIN_MODEL) +$(SEG_MODELFILE): wget -O $@ $(SEG_MODEL) - -models_ocr_v0_5_0: models_ocr_v0_5_0.tar.gz - tar zxf models_ocr_v0_5_0.tar.gz - -models_ocr_v0_5_0.tar.gz: +$(OCR_MODELFILE): wget -O $@ $(OCR_MODEL) -default-2021-03-09: $(notdir $(BIN_MODEL)) - unzip $(notdir $(BIN_MODEL)) +$(BIN_MODELNAME): $(BIN_MODELFILE) mkdir $@ - mv $(basename $(notdir $(BIN_MODEL))) $@ - -$(notdir $(BIN_MODEL)): - wget $(BIN_MODEL) + unzip -d $@ $< +$(SEG_MODELNAME): $(SEG_MODELFILE) + tar zxf $< +$(OCR_MODELNAME): $(OCR_MODELFILE) + tar zxf $< build: $(PIP) install build @@ -82,7 +85,10 @@ install: install-dev: $(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)]) -deps-test: models_layout_v0_5_0 +ifeq (OCR,$(findstring OCR, $(EXTRAS))) +deps-test: $(OCR_MODELNAME) +endif +deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME) $(PIP) install -r requirements-test.txt smoke-test: TMPDIR != mktemp -d @@ -123,9 +129,9 @@ ocrd-test: tests/resources/kant_aufklaerung_1784_0020.tif $(RM) -r $(TMPDIR) # Run unit tests -test: export MODELS_LAYOUT=$(CURDIR)/models_layout_v0_5_0 -test: export MODELS_OCR=$(CURDIR)/models_ocr_v0_5_0 -test: export MODELS_BIN=$(CURDIR)/default-2021-03-09 +test: export MODELS_LAYOUT=$(CURDIR)/$(SEG_MODELNAME) +test: export MODELS_OCR=$(CURDIR)/$(OCR_MODELNAME) +test: export MODELS_BIN=$(CURDIR)/$(BIN_MODELNAME) test: $(PYTHON) 
-m pytest tests --durations=0 --continue-on-collection-errors $(PYTEST_ARGS) From c86e59f481ee47ccb9938b7f6105f95f626c5f17 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 22:03:46 +0200 Subject: [PATCH 038/101] CI: update model key, split up cache restore/save --- .github/workflows/test-eynollah.yml | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 042e508..ca213cb 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -24,17 +24,17 @@ jobs: sudo rm -rf "$AGENT_TOOLSDIRECTORY" df -h - uses: actions/checkout@v4 - - uses: actions/cache@v4 + - uses: actions/cache/restore@v4 id: seg_model_cache with: path: models_layout_v0_5_0 - key: ${{ runner.os }}-models - - uses: actions/cache@v4 + key: ${{ runner.os }}-seg-models + - uses: actions/cache/restore@v4 id: ocr_model_cache with: - path: models_ocr_v0_5_0 - key: ${{ runner.os }}-models - - uses: actions/cache@v4 + path: models_ocr_v0_5_1 + key: ${{ runner.os }}-ocr-models + - uses: actions/cache/restore@v4 id: bin_model_cache with: path: default-2021-03-09 @@ -42,6 +42,21 @@ jobs: - name: Download models if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' || steps.ocr_model_cache.outputs.cache-hit != true run: make models + - uses: actions/cache/save@v4 + if: steps.seg_model_cache.outputs.cache-hit != 'true' + with: + path: models_layout_v0_5_0 + key: ${{ runner.os }}-seg-models + - uses: actions/cache/save@v4 + if: steps.ocr_model_cache.outputs.cache-hit != 'true' + with: + path: models_ocr_v0_5_1 + key: ${{ runner.os }}-ocr-models + - uses: actions/cache/save@v4 + if: steps.bin_model_cache.outputs.cache-hit != 'true' + with: + path: default-2021-03-09 + key: ${{ runner.os }}-modelbin - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: From 
ad129ed46c70b03fea7b48060e40e2451b40b975 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 22:05:53 +0200 Subject: [PATCH 039/101] CI: remove OS from model cache keys --- .github/workflows/test-eynollah.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index ca213cb..9d5b2c8 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -28,17 +28,17 @@ jobs: id: seg_model_cache with: path: models_layout_v0_5_0 - key: ${{ runner.os }}-seg-models + key: seg-models - uses: actions/cache/restore@v4 id: ocr_model_cache with: path: models_ocr_v0_5_1 - key: ${{ runner.os }}-ocr-models + key: ocr-models - uses: actions/cache/restore@v4 id: bin_model_cache with: path: default-2021-03-09 - key: ${{ runner.os }}-modelbin + key: bin-models - name: Download models if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' || steps.ocr_model_cache.outputs.cache-hit != true run: make models @@ -46,17 +46,17 @@ jobs: if: steps.seg_model_cache.outputs.cache-hit != 'true' with: path: models_layout_v0_5_0 - key: ${{ runner.os }}-seg-models + key: seg-models - uses: actions/cache/save@v4 if: steps.ocr_model_cache.outputs.cache-hit != 'true' with: path: models_ocr_v0_5_1 - key: ${{ runner.os }}-ocr-models + key: ocr-models - uses: actions/cache/save@v4 if: steps.bin_model_cache.outputs.cache-hit != 'true' with: path: default-2021-03-09 - key: ${{ runner.os }}-modelbin + key: bin-models - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: From 7daec392b9846931b932d48fde71680ab4bf33e9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 22:10:45 +0200 Subject: [PATCH 040/101] Dockerfile: fix up CUDA installation for mixed TF/Torch --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 
4ba498b..a15776e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,6 +40,8 @@ RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json # install everything and reduce image size RUN make install EXTRAS=OCR && rm -rf /build/eynollah +# fixup for broken cuDNN installation (Torch pulls in 8.5.0, which is incompatible with Tensorflow) +RUN pip install nvidia-cudnn-cu11==8.6.0.163 # smoke test RUN eynollah --help From f0de1adabf45f3dd70df72ddc09795a4512d5316 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 23:12:18 +0200 Subject: [PATCH 041/101] rm loky dependency --- .gitignore | 4 ++++ requirements.txt | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0d5d834..3cc0eac 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,11 @@ __pycache__ sbb_newspapers_org_image/pylint.log models_eynollah* +models_ocr* +models_layout* +default-2021-03-09 output.html /build /dist *.tif +TAGS diff --git a/requirements.txt b/requirements.txt index 4bc0c6a..db1d7df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,4 @@ scikit-learn >= 0.23.2 tensorflow < 2.13 numba <= 0.58.1 scikit-image -loky biopython From 3aa7ad04fafd842fe31c36094a2b51fa43cc1bd3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 Sep 2025 23:14:52 +0200 Subject: [PATCH 042/101] :memo: update changelog --- CHANGELOG.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ad9a09..f6776d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,33 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + + * :fire: polygons: avoid invalid paths (use `Polygon.buffer()` instead of dilation etc.) 
+ * `return_boxes_of_images_by_order_of_reading_new`: avoid Numpy.dtype mismatch, simplify + * `return_boxes_of_images_by_order_of_reading_new`: log any exceptions instead of ignoring + * `filter_contours_without_textline_inside`: avoid removing from duplicate lists twice + * `get_marginals`: exit early if no peaks found to avoid spurious overlap mask + * `get_smallest_skew`: after shifting search range of rotation angle, use overall best result + * Dockerfile: fix CUDA installation (cuDNN contested between Torch and TF due to extra OCR) + * OCR: re-instate missing methods and fix `utils_ocr` function calls + * :fire: writer: `SeparatorRegion` needs `SeparatorRegionType` (not `ImageRegionType`) +f458e3e + * tests: switch from `pytest-subtests` to `parametrize` so we can use `pytest-isolate` + (so CUDA memory gets freed between tests if running on GPU) + +Changed: + + * polygons: slightly widen for regions and lines, increase for separators + * various refactorings, some code style and identifier improvements + * deskewing/multiprocessing: switch back to ProcessPoolExecutor (faster), + but use shared memory if necessary, and switch back from `loky` to stdlib, + and shutdown in `del()` instead of `atexit` + * :fire: OCR: switch CNN-RNN model to `20250930` version compatible with TF 2.12 on CPU, too + * :fire: writer: use `@type='heading'` instead of `'header'` for headings + * CI: update+improve model caching + + ## [0.5.0] - 2025-09-26 Fixed: From 5725e4fd1f6bab4c1152c88cc28c44c0e8c2c584 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 1 Oct 2025 15:58:03 +0200 Subject: [PATCH 043/101] =?UTF-8?q?-Continue=20processing=20when=20num=5Fc?= =?UTF-8?q?ol=20is=20None=20but=20textregions=20exist.=20-Convert=20margin?= =?UTF-8?q?al-only=20=20to=20main=20body=20if=20no=20main=20body=20is=20pr?= =?UTF-8?q?esent.=20-Reset=20deskew=20angle=20to=200=20when=20text=20regio?= =?UTF-8?q?n=20density=20(textregion=20area=20to=20page=20area)=20<=200.3?= 
=?UTF-8?q?=20and=20angle=20>=2045=C2=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/eynollah.py | 41 +++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 20954a0..5e8412e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1,4 +1,4 @@ -# pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring,missing-class-docstring,too-many-branches +#run_single# pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring,missing-class-docstring,too-many-branches # pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member # pylint: disable=too-many-public-methods,too-many-arguments,too-many-instance-attributes,too-many-public-methods, # pylint: disable=consider-using-enumerate @@ -2245,6 +2245,7 @@ class Eynollah: ##mask_texts_only = cv2.dilate(mask_texts_only, KERNEL, iterations=1) mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) + mask_images_only=(prediction_regions_org[:,:] ==2)*1 polygons_lines_xml, hir_lines_xml = return_contours_of_image(mask_lines_only) @@ -2280,20 +2281,18 @@ class Eynollah: text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) - #plt.imshow(textline_mask_tot_ea) #plt.show() textline_mask_tot_ea[(text_regions_p_true==0) | (text_regions_p_true==4) ] = 0 - #plt.imshow(textline_mask_tot_ea) #plt.show() #print("inside 4 ", time.time()-t_in) self.logger.debug("exit get_regions_light_v") - return text_regions_p_true, erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin, confidence_matrix + return text_regions_p_true, erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, 
img_bin, confidence_matrix, polygons_of_only_texts else: img_bin = resize_image(img_bin,img_height_h, img_width_h ) self.logger.debug("exit get_regions_light_v") - return None, erosion_hurts, None, textline_mask_tot_ea, img_bin, None + return None, erosion_hurts, None, textline_mask_tot_ea, img_bin, None, None def get_regions_from_xy_2models(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_from_xy_2models") @@ -2386,7 +2385,7 @@ class Eynollah: text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) self.logger.debug("exit get_regions_from_xy_2models") - return text_regions_p_true, erosion_hurts, polygons_lines_xml + return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_only_texts except: if self.input_binary: prediction_bin = np.copy(img_org) @@ -2436,7 +2435,7 @@ class Eynollah: erosion_hurts = True self.logger.debug("exit get_regions_from_xy_2models") - return text_regions_p_true, erosion_hurts, polygons_lines_xml + return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_only_texts def do_order_of_regions_full_layout( self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): @@ -4701,7 +4700,7 @@ class Eynollah: self.logger.info("Step 2/5: Basic Processing Mode") self.logger.info("Skipping layout analysis and reading order detection") - _ ,_, _, textline_mask_tot_ea, img_bin_light, _ = \ + _ ,_, _, textline_mask_tot_ea, img_bin_light, _,_= \ self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier, skip_layout_and_reading_order=self.skip_layout_and_reading_order) @@ -4768,10 +4767,10 @@ class Eynollah: if self.light_version: self.logger.info("Using light version processing") - text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light, confidence_matrix = \ + text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light, confidence_matrix, 
polygons_text_early = \ self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) #print("text region early -2 in %.1fs", time.time() - t0) - + if num_col_classifier == 1 or num_col_classifier ==2: if num_col_classifier == 1: img_w_new = 1000 @@ -4793,9 +4792,9 @@ class Eynollah: #self.logger.info("run graphics %.1fs ", time.time() - t1t) #print("text region early -3 in %.1fs", time.time() - t0) textline_mask_tot_ea_org = np.copy(textline_mask_tot_ea) - #print("text region early -4 in %.1fs", time.time() - t0) + else: - text_regions_p_1 ,erosion_hurts, polygons_lines_xml = \ + text_regions_p_1 ,erosion_hurts, polygons_lines_xml, polygons_text_early = \ self.get_regions_from_xy_2models(img_res, is_image_enhanced, num_col_classifier) self.logger.info(f"Textregion detection took {time.time() - t1:.1f}s") @@ -4811,7 +4810,7 @@ class Eynollah: #plt.show() self.logger.info(f"Layout analysis complete ({time.time() - t1:.1f}s)") - if not num_col: + if not num_col and len(polygons_text_early) == 0: self.logger.info("No columns detected - generating empty PAGE-XML") pcgts = self.writer.build_pagexml_no_full_layout( @@ -4848,6 +4847,15 @@ class Eynollah: textline_mask_tot, text_regions_p, image_page_rotated = \ self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) + + + if image_page.shape[0]!=0 and image_page.shape[1]!=0: + # if ratio of text regions to page area is smaller that 0.3, deskew angle is not aloowed to exceed 45 + if ( ( text_regions_p[:,:]==1).sum() + (text_regions_p[:,:]==4).sum() ) / float(image_page.shape[0]*image_page.shape[1] ) <= 0.3 and abs(slope_deskew) > 45: + slope_deskew = 0 + + if (text_regions_p[:,:]==1).sum() == 0: + text_regions_p[:,:][text_regions_p[:,:]==4] = 1 self.logger.info("Step 3/5: Text Line Detection") @@ -4894,6 +4902,8 @@ class Eynollah: ###min_con_area = 0.000005 contours_only_text, hir_on_text = 
return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + + if len(contours_only_text_parent) > 0: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) @@ -4995,7 +5005,9 @@ class Eynollah: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] #contours_only_text_parent = [] - + + boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) + if not len(contours_only_text_parent): # stop early empty_marginals = [[]] * len(polygons_of_marginals) @@ -5031,7 +5043,6 @@ class Eynollah: contours_only_text_parent, self.image, slope_first, confidence_matrix, map=self.executor.map) #print("text region early 4 in %.1fs", time.time() - t0) boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) - boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) #print("text region early 5 in %.1fs", time.time() - t0) ## birdan sora chock chakir if not self.curved_line: From 0b9d4901a61ea777fc0db6e90930a734fe33302d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 20:51:03 +0200 Subject: [PATCH 044/101] contour features: avoid unused calculations, simplify, add shortcuts - new function: `find_center_of_contours` - simplified: `find_(new_)features_of_contours` --- src/eynollah/utils/contour.py | 78 ++++++++++++----------------------- 1 file changed, 27 insertions(+), 51 deletions(-) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 0700ed4..041cbf6 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -79,61 +79,37 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area=1. 
found_polygons_early.append(polygon2contour(polygon)) return found_polygons_early -def find_new_features_of_contours(contours_main): - areas_main = np.array([cv2.contourArea(contours_main[j]) - for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) - for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) - for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) - for j in range(len(M_main))] - try: - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) - for j in range(len(contours_main))]) - argmin_x_main = np.array([np.argmin(contours_main[j][:, 0, 0]) - for j in range(len(contours_main))]) - x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 0, 0] - for j in range(len(contours_main))]) - y_corr_x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 0, 1] - for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) - for j in range(len(contours_main))]) - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) - for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) - for j in range(len(contours_main))]) - except: - x_min_main = np.array([np.min(contours_main[j][:, 0]) - for j in range(len(contours_main))]) - argmin_x_main = np.array([np.argmin(contours_main[j][:, 0]) - for j in range(len(contours_main))]) - x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 0] - for j in range(len(contours_main))]) - y_corr_x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 1] - for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0]) - for j in range(len(contours_main))]) - y_min_main = np.array([np.min(contours_main[j][:, 1]) - for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 1]) - for j in range(len(contours_main))]) - # dis_x=np.abs(x_max_main-x_min_main) +def 
find_center_of_contours(contours): + moments = [cv2.moments(contour) for contour in contours] + cx = [feat["m10"] / (feat["m00"] + 1e-32) + for feat in moments] + cy = [feat["m01"] / (feat["m00"] + 1e-32) + for feat in moments] + return cx, cy - return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin +def find_new_features_of_contours(contours): + # areas = np.array([cv2.contourArea(contour) for contour in contours]) + cx, cy = find_center_of_contours(contours) + slice_x = np.index_exp[:, 0, 0] + slice_y = np.index_exp[:, 0, 1] + if any(contour.ndim < 3 for contour in contours): + slice_x = np.index_exp[:, 0] + slice_y = np.index_exp[:, 1] + x_min = np.array([np.min(contour[slice_x]) for contour in contours]) + x_max = np.array([np.max(contour[slice_x]) for contour in contours]) + y_min = np.array([np.min(contour[slice_y]) for contour in contours]) + y_max = np.array([np.max(contour[slice_y]) for contour in contours]) + # dis_x=np.abs(x_max-x_min) + y_corr_x_min = np.array([contour[np.argmin(contour[slice_x])][slice_y[1:]] + for contour in contours]) -def find_features_of_contours(contours_main): - areas_main=np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main=[cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main=[(M_main[j]['m10']/(M_main[j]['m00']+1e-32)) for j in range(len(M_main))] - cy_main=[(M_main[j]['m01']/(M_main[j]['m00']+1e-32)) for j in range(len(M_main))] - x_min_main=np.array([np.min(contours_main[j][:,0,0]) for j in range(len(contours_main))]) - x_max_main=np.array([np.max(contours_main[j][:,0,0]) for j in range(len(contours_main))]) + return cx, cy, x_min, x_max, y_min, y_max, y_corr_x_min - y_min_main=np.array([np.min(contours_main[j][:,0,1]) for j in range(len(contours_main))]) - y_max_main=np.array([np.max(contours_main[j][:,0,1]) for j in range(len(contours_main))]) +def find_features_of_contours(contours): + y_min = 
np.array([np.min(contour[:,0,1]) for contour in contours]) + y_max = np.array([np.max(contour[:,0,1]) for contour in contours]) - return y_min_main, y_max_main + return y_min, y_max def return_parent_contours(contours, hierarchy): contours_parent = [contours[i] From 81827c2942e0a6b7e4121b9de510108de4f026fa Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:03:07 +0200 Subject: [PATCH 045/101] filter_contours_inside_a_bigger_one: simplify - use new `find_center_of_contours` - avoid loops in favour of array processing - use sets instead of `np.unique` and `np.delete` instead of list.pop --- src/eynollah/eynollah.py | 102 +++++++++++++++------------------------ 1 file changed, 39 insertions(+), 63 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 62ce002..b2d9016 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4208,7 +4208,7 @@ class Eynollah: return generated_text def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes): - return [ls_cons[sorted_indexes[index]] for index in range(len(sorted_indexes))] + return list(np.array(ls_cons)[np.array(sorted_indexes)]) def return_it_in_two_groups(self, x_differential): split = [ind if x_differential[ind]!=x_differential[ind+1] else -1 @@ -4237,47 +4237,38 @@ class Eynollah: def filter_contours_inside_a_bigger_one(self, contours, contours_d_ordered, image, marginal_cnts=None, type_contour="textregion"): - if type_contour=="textregion": - areas = [cv2.contourArea(contours[j]) for j in range(len(contours))] + if type_contour == "textregion": + areas = np.array(list(map(cv2.contourArea, contours))) area_tot = image.shape[0]*image.shape[1] + areas_ratio = areas / area_tot + cx_main, cy_main = find_center_of_contours(contours) - M_main = [cv2.moments(contours[j]) - for j in range(len(contours))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / 
(M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + contours_index_small = np.flatnonzero(areas_ratio < 1e-3) + contours_index_large = np.flatnonzero(areas_ratio >= 1e-3) - areas_ratio = np.array(areas)/ area_tot - contours_index_small = [ind for ind in range(len(contours)) if areas_ratio[ind] < 1e-3] - contours_index_big = [ind for ind in range(len(contours)) if areas_ratio[ind] >= 1e-3] - - #contours_> = [contours[ind] for ind in contours_index_big] + #contours_> = [contours[ind] for ind in contours_index_large] indexes_to_be_removed = [] for ind_small in contours_index_small: - results = [cv2.pointPolygonTest(contours[ind], (cx_main[ind_small], cy_main[ind_small]), False) - for ind in contours_index_big] - if marginal_cnts: - results_marginal = [cv2.pointPolygonTest(marginal_cnts[ind], + results = [cv2.pointPolygonTest(contours[ind_large], (cx_main[ind_small], + cy_main[ind_small]), + False) + for ind_large in contours_index_large] + results = np.array(results) + if np.any(results==1): + indexes_to_be_removed.append(ind_small) + elif marginal_cnts: + results_marginal = [cv2.pointPolygonTest(marginal_cnt, (cx_main[ind_small], cy_main[ind_small]), False) - for ind in range(len(marginal_cnts))] + for marginal_cnt in marginal_cnts] results_marginal = np.array(results_marginal) - if np.any(results_marginal==1): indexes_to_be_removed.append(ind_small) - results = np.array(results) - - if np.any(results==1): - indexes_to_be_removed.append(ind_small) - - if len(indexes_to_be_removed)>0: - indexes_to_be_removed = np.unique(indexes_to_be_removed) - indexes_to_be_removed = np.sort(indexes_to_be_removed)[::-1] - for ind in indexes_to_be_removed: - contours.pop(ind) - if len(contours_d_ordered)>0: - contours_d_ordered.pop(ind) + contours = np.delete(contours, indexes_to_be_removed, axis=0) + if len(contours_d_ordered): + contours_d_ordered = np.delete(contours_d_ordered, indexes_to_be_removed, axis=0) return contours, contours_d_ordered @@ -4285,33 +4276,21 @@ class 
Eynollah: contours_txtline_of_all_textregions = [] indexes_of_textline_tot = [] index_textline_inside_textregion = [] + for ind_region, textlines in enumerate(contours): + contours_txtline_of_all_textregions.extend(textlines) + index_textline_inside_textregion.extend(list(range(len(textlines)))) + indexes_of_textline_tot.extend([ind_region] * len(textlines)) - for jj in range(len(contours)): - contours_txtline_of_all_textregions = contours_txtline_of_all_textregions + contours[jj] - - ind_textline_inside_tr = list(range(len(contours[jj]))) - index_textline_inside_textregion = index_textline_inside_textregion + ind_textline_inside_tr - ind_ins = [jj] * len(contours[jj]) - indexes_of_textline_tot = indexes_of_textline_tot + ind_ins - - M_main_tot = [cv2.moments(contours_txtline_of_all_textregions[j]) - for j in range(len(contours_txtline_of_all_textregions))] - cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - - areas_tot = [cv2.contourArea(con_ind) for con_ind in contours_txtline_of_all_textregions] + areas_tot = np.array(list(map(cv2.contourArea, contours_txtline_of_all_textregions))) area_tot_tot = image.shape[0]*image.shape[1] + cx_main_tot, cy_main_tot = find_center_of_contours(contours_txtline_of_all_textregions) - textregion_index_to_del = [] - textline_in_textregion_index_to_del = [] + textline_in_textregion_index_to_del = {} for ij in range(len(contours_txtline_of_all_textregions)): - args_all = list(np.array(range(len(contours_txtline_of_all_textregions)))) - args_all.pop(ij) - - areas_without = np.array(areas_tot)[args_all] area_of_con_interest = areas_tot[ij] - - args_with_bigger_area = np.array(args_all)[areas_without > 1.5*area_of_con_interest] + args_without = np.delete(np.arange(len(contours_txtline_of_all_textregions)), ij) + areas_without = areas_tot[args_without] + args_with_bigger_area = 
args_without[areas_without > 1.5*area_of_con_interest] if len(args_with_bigger_area)>0: results = [cv2.pointPolygonTest(contours_txtline_of_all_textregions[ind], @@ -4322,18 +4301,15 @@ class Eynollah: results = np.array(results) if np.any(results==1): #print(indexes_of_textline_tot[ij], index_textline_inside_textregion[ij]) - textregion_index_to_del.append(int(indexes_of_textline_tot[ij])) - textline_in_textregion_index_to_del.append(int(index_textline_inside_textregion[ij])) - #contours[int(indexes_of_textline_tot[ij])].pop(int(index_textline_inside_textregion[ij])) + textline_in_textregion_index_to_del.setdefault( + indexes_of_textline_tot[ij], list()).append( + index_textline_inside_textregion[ij]) + #contours[indexes_of_textline_tot[ij]].pop(index_textline_inside_textregion[ij]) - textregion_index_to_del = np.array(textregion_index_to_del) - textline_in_textregion_index_to_del = np.array(textline_in_textregion_index_to_del) - for ind_u_a_trs in np.unique(textregion_index_to_del): - textline_in_textregion_index_to_del_ind = \ - textline_in_textregion_index_to_del[textregion_index_to_del==ind_u_a_trs] - textline_in_textregion_index_to_del_ind = np.sort(textline_in_textregion_index_to_del_ind)[::-1] - for ittrd in textline_in_textregion_index_to_del_ind: - contours[ind_u_a_trs].pop(ittrd) + for textregion_index_to_del in textline_in_textregion_index_to_del: + contours[textregion_index_to_del] = list(np.delete( + contours[textregion_index_to_del], + textline_in_textregion_index_to_del[textregion_index_to_del])) return contours From 8869c20c33c673e02e4f60081b96a8bd71d823d2 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 6 Oct 2025 14:53:47 +0200 Subject: [PATCH 046/101] updating CHANGELOG for v0.5.0 --- CHANGELOG.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bfdd1ce..70e8854 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,12 @@ Versioned according to [Semantic 
Versioning](http://semver.org/). Fixed: * restoring the contour in the original image caused an error due to an empty tuple, #154 + * removed NumPy warnings (fixed issue #158) + * fixed issue #124 + * Drop capitals are now handled separately from their corresponding textline + * Marginals are now divided into left and right. Their reading order is written first for left marginals, then for right marginals, and within each side from top to bottom + * Added a new page extraction model. Instead of bounding boxes, it outputs page contours in the XML file, improving results for skewed pages + * Improved reading order for cases where a textline is segmented into multiple smaller textlines Changed @@ -24,6 +30,20 @@ Added: * `eynollah machine-based-reading-order` CLI to run reading order detection, #175 * `eynollah enhancement` CLI to run image enhancement, #175 * Improved models for page extraction and reading order detection, #175 + * For the lightweight version (layout and textline detection), thresholds are now assigned to the artificial class. Users can apply these thresholds to improve detection of isolated textlines and regions. 
To counteract the drawback of thresholding, the skeleton of the artificial class is used to keep lines as thin as possible (resolved issues #163 and #161) + * Added and integrated trained CNN-RNN OCR models + * Added and integrated a trained TrOCR model + * Improved OCR detection to support vertical and curved textlines + * Introduced a new machine-based reading order model with rotation augmentation + * Optimized reading order speed by clustering text regions that belong to the same block, maintaining top-to-bottom order + * Implemented text merging across textlines based on hyphenation when a line ends with a hyphen + * Integrated image enhancement as a separate use case + * Added reading order functionality on the layout level as a separate use case + * CNN-RNN OCR models provide confidence scores for predictions + * Added OCR visualization: predicted OCR can be overlaid on an image of the same size as the input + * Introduced a threshold value for CNN-RNN OCR models, allowing users to filter out low-confidence textline predictions + * For OCR, users can specify a single model by name instead of always using the default model + * Under the OCR use case, if ground-truth XMLs and images are available, textline image and corresponding text extraction can now be performed Merged PRs: From 4ffe6190d2c6b885b27330027f4a0d8fd97a32f6 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 9 Oct 2025 14:03:26 +0200 Subject: [PATCH 047/101] :memo: changelog --- CHANGELOG.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70e8854..5ca95a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
Fixed: * restoring the contour in the original image caused an error due to an empty tuple, #154 - * removed NumPy warnings (fixed issue #158) - * fixed issue #124 + * removed NumPy warnings calculating sigma, mean, (fixed issue #158) + * fixed bug in `separate_lines.py`, #124 * Drop capitals are now handled separately from their corresponding textline * Marginals are now divided into left and right. Their reading order is written first for left marginals, then for right marginals, and within each side from top to bottom * Added a new page extraction model. Instead of bounding boxes, it outputs page contours in the XML file, improving results for skewed pages @@ -31,7 +31,7 @@ Added: * `eynollah enhancement` CLI to run image enhancement, #175 * Improved models for page extraction and reading order detection, #175 * For the lightweight version (layout and textline detection), thresholds are now assigned to the artificial class. Users can apply these thresholds to improve detection of isolated textlines and regions. 
To counteract the drawback of thresholding, the skeleton of the artificial class is used to keep lines as thin as possible (resolved issues #163 and #161) - * Added and integrated trained CNN-RNN OCR models + * Added and integrated a trained CNN-RNN OCR models * Added and integrated a trained TrOCR model * Improved OCR detection to support vertical and curved textlines * Introduced a new machine-based reading order model with rotation augmentation @@ -43,7 +43,7 @@ Added: * Added OCR visualization: predicted OCR can be overlaid on an image of the same size as the input * Introduced a threshold value for CNN-RNN OCR models, allowing users to filter out low-confidence textline predictions * For OCR, users can specify a single model by name instead of always using the default model - * Under the OCR use case, if ground-truth XMLs and images are available, textline image and corresponding text extraction can now be performed + * Under the OCR use case, if Ground Truth XMLs and images are available, textline image and corresponding text extraction can now be performed Merged PRs: From 8c3d5eb0eb0eccd97542a86b0d3385e95f4f1da0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:07:35 +0200 Subject: [PATCH 048/101] separate_marginals_to_left_and_right_and_order_from_top_to_down: simplify - use new `find_center_of_contours` - avoid loops in favour of array processing - avoid repeated sorting --- src/eynollah/eynollah.py | 75 +++++++++++++++++----------------- src/eynollah/utils/__init__.py | 2 +- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b2d9016..9eba3d3 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4418,52 +4418,53 @@ class Eynollah: def separate_marginals_to_left_and_right_and_order_from_top_to_down( self, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width): - cx_marg, 
cy_marg, _, _, _, _, _ = find_new_features_of_contours( - polygons_of_marginals) - + cx_marg, cy_marg = find_center_of_contours(polygons_of_marginals) cx_marg = np.array(cx_marg) cy_marg = np.array(cy_marg) + + def split(lis): + array = np.array(lis) + return (list(array[cx_marg < mid_point_of_page_width]), + list(array[cx_marg >= mid_point_of_page_width])) + + (poly_marg_left, + poly_marg_right) = \ + split(polygons_of_marginals) + + (all_found_textline_polygons_marginals_left, + all_found_textline_polygons_marginals_right) = \ + split(all_found_textline_polygons_marginals) - poly_marg_left = list( np.array(polygons_of_marginals)[cx_marg < mid_point_of_page_width] ) - poly_marg_right = list( np.array(polygons_of_marginals)[cx_marg >= mid_point_of_page_width] ) + (all_box_coord_marginals_left, + all_box_coord_marginals_right) = \ + split(all_box_coord_marginals) - all_found_textline_polygons_marginals_left = \ - list( np.array(all_found_textline_polygons_marginals)[cx_marg < mid_point_of_page_width] ) - all_found_textline_polygons_marginals_right = \ - list( np.array(all_found_textline_polygons_marginals)[cx_marg >= mid_point_of_page_width] ) + (slopes_marg_left, + slopes_marg_right) = \ + split(slopes_marginals) - all_box_coord_marginals_left = list( np.array(all_box_coord_marginals)[cx_marg < mid_point_of_page_width] ) - all_box_coord_marginals_right = list( np.array(all_box_coord_marginals)[cx_marg >= mid_point_of_page_width] ) + (cy_marg_left, + cy_marg_right) = \ + split(cy_marg) + + order_left = np.argsort(cy_marg_left) + order_right = np.argsort(cy_marg_right) + def sort_left(lis): + return list(np.array(lis)[order_left]) + def sort_right(lis): + return list(np.array(lis)[order_right]) - slopes_marg_left = list( np.array(slopes_marginals)[cx_marg < mid_point_of_page_width] ) - slopes_marg_right = list( np.array(slopes_marginals)[cx_marg >= mid_point_of_page_width] ) + ordered_left_marginals = sort_left(poly_marg_left) + ordered_right_marginals = 
sort_right(poly_marg_right) - cy_marg_left = cy_marg[cx_marg < mid_point_of_page_width] - cy_marg_right = cy_marg[cx_marg >= mid_point_of_page_width] + ordered_left_marginals_textline = sort_left(all_found_textline_polygons_marginals_left) + ordered_right_marginals_textline = sort_right(all_found_textline_polygons_marginals_right) - ordered_left_marginals = [poly for _, poly in sorted(zip(cy_marg_left, poly_marg_left), - key=lambda x: x[0])] - ordered_right_marginals = [poly for _, poly in sorted(zip(cy_marg_right, poly_marg_right), - key=lambda x: x[0])] + ordered_left_marginals_bbox = sort_left(all_box_coord_marginals_left) + ordered_right_marginals_bbox = sort_right(all_box_coord_marginals_right) - ordered_left_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_left, - all_found_textline_polygons_marginals_left), - key=lambda x: x[0])] - ordered_right_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_right, - all_found_textline_polygons_marginals_right), - key=lambda x: x[0])] - - ordered_left_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_left, - all_box_coord_marginals_left), - key=lambda x: x[0])] - ordered_right_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_right, - all_box_coord_marginals_right), - key=lambda x: x[0])] - - ordered_left_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_left, slopes_marg_left), - key=lambda x: x[0])] - ordered_right_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_right, slopes_marg_right), - key=lambda x: x[0])] + ordered_left_slopes_marginals = sort_left(slopes_marg_left) + ordered_right_slopes_marginals = sort_right(slopes_marg_right) return (ordered_left_marginals, ordered_right_marginals, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 52bf3ef..4eee5a9 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1417,7 +1417,7 @@ def 
combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( imgray = cv2.cvtColor(sep_ver_hor_cross, cv2.COLOR_BGR2GRAY) ret, thresh = cv2.threshold(imgray, 0, 255, 0) contours_cross,_=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - cx_cross,cy_cross ,_ , _, _ ,_,_=find_new_features_of_contours(contours_cross) + cx_cross, cy_cross = find_center_of_contours(contours_cross) for ii in range(len(cx_cross)): img_p_in[int(cy_cross[ii])-30:int(cy_cross[ii])+30,int(cx_cross[ii])+5:int(cx_cross[ii])+40,0]=0 img_p_in[int(cy_cross[ii])-30:int(cy_cross[ii])+30,int(cx_cross[ii])-40:int(cx_cross[ii])-4,0]=0 From 3f3353ec3a53384a100ef9ebe2fefb7e092e968c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:28:04 +0200 Subject: [PATCH 049/101] do_order_of_regions: simplify - avoid loops in favour of array processing --- src/eynollah/eynollah.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9eba3d3..7f7f53f 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2518,6 +2518,8 @@ class Eynollah: self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): self.logger.debug("enter do_order_of_regions_full_layout") + contours_only_text_parent = np.array(contours_only_text_parent) + contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( contours_only_text_parent) @@ -2573,14 +2575,9 @@ class Eynollah: xs = slice(*boxes[iij][0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = [] - con_inter_box_h = [] + con_inter_box = contours_only_text_parent[args_contours_box] + con_inter_box_h = 
contours_only_text_parent_h[args_contours_box_h] - for box in args_contours_box: - con_inter_box.append(contours_only_text_parent[box]) - - for box in args_contours_box_h: - con_inter_box_h.append(contours_only_text_parent_h[box]) indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) @@ -2675,14 +2672,8 @@ class Eynollah: xs = slice(*boxes[iij][0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = [] - con_inter_box_h = [] - - for box in args_contours_box: - con_inter_box.append(contours_only_text_parent[box]) - - for box in args_contours_box_h: - con_inter_box_h.append(contours_only_text_parent_h[box]) + con_inter_box = contours_only_text_parent[args_contours_box] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) @@ -2729,6 +2720,8 @@ class Eynollah: self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): self.logger.debug("enter do_order_of_regions_no_full_layout") + contours_only_text_parent = np.array(contours_only_text_parent) + contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( contours_only_text_parent) @@ -2761,10 +2754,8 @@ class Eynollah: ys = slice(*boxes[iij][2:4]) xs = slice(*boxes[iij][0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = [] + con_inter_box = contours_only_text_parent[args_contours_box] con_inter_box_h = [] - for i in range(len(args_contours_box)): - 
con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) From 415b2cbad843d4fa083f94f459777af97bd81234 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:36:22 +0200 Subject: [PATCH 050/101] eynollah, drop_capitals: simplify - use new `find_center_of_contours` --- src/eynollah/eynollah.py | 21 ++++++++------------- src/eynollah/utils/drop_capitals.py | 27 ++++++++++++++------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 7f7f53f..357c0c2 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -70,6 +70,7 @@ from .utils.contour import ( filter_contours_area_of_image, filter_contours_area_of_image_tables, find_contours_mean_y_diff, + find_center_of_contours, find_new_features_of_contours, find_features_of_contours, get_text_region_boxes_by_given_contours, @@ -1859,14 +1860,10 @@ class Eynollah: def get_slopes_and_deskew_new_light2(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): polygons_of_textlines = return_contours_of_interested_region(textline_mask_tot,1,0.00001) - M_main_tot = [cv2.moments(polygons_of_textlines[j]) - for j in range(len(polygons_of_textlines))] + cx_main_tot, cy_main_tot = find_center_of_contours(polygons_of_textlines) + w_h_textlines = [cv2.boundingRect(polygon)[2:] for polygon in polygons_of_textlines] - w_h_textlines = [cv2.boundingRect(polygons_of_textlines[i])[2:] for i in range(len(polygons_of_textlines))] - cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - - args_textlines = np.array(range(len(polygons_of_textlines))) + args_textlines = 
np.arange(len(polygons_of_textlines)) all_found_textline_polygons = [] slopes = [] all_box_coord =[] @@ -4809,8 +4806,8 @@ class Eynollah: areas_cnt_text_parent = self.return_list_of_contours_with_desired_order( areas_cnt_text_parent, index_con_parents) - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) + cx_bigest_big, cy_biggest_big = find_center_of_contours([contours_biggest]) + cx_bigest, cy_biggest = find_center_of_contours(contours_only_text_parent) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) @@ -4834,10 +4831,8 @@ class Eynollah: areas_cnt_text_d = self.return_list_of_contours_with_desired_order( areas_cnt_text_d, index_con_parents_d) - cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = \ - find_new_features_of_contours([contours_biggest_d]) - cx_bigest_d, cy_biggest_d, _, _, _, _, _ = \ - find_new_features_of_contours(contours_only_text_parent_d) + cx_bigest_d_big, cy_biggest_d_big = find_center_of_contours([contours_biggest_d]) + cx_bigest_d, cy_biggest_d = find_center_of_contours(contours_only_text_parent_d) try: if len(cx_bigest_d) >= 5: cx_bigest_d_last5 = cx_bigest_d[-5:] diff --git a/src/eynollah/utils/drop_capitals.py b/src/eynollah/utils/drop_capitals.py index 67547d3..9f82fac 100644 --- a/src/eynollah/utils/drop_capitals.py +++ b/src/eynollah/utils/drop_capitals.py @@ -1,6 +1,7 @@ import numpy as np import cv2 from .contour import ( + find_center_of_contours, find_new_features_of_contours, return_contours_of_image, return_parent_contours, @@ -22,8 +23,8 @@ def adhere_drop_capital_region_into_corresponding_textline( ): # print(np.shape(all_found_textline_polygons),np.shape(all_found_textline_polygons[3]),'all_found_textline_polygonsshape') # print(all_found_textline_polygons[3]) - cx_m, cy_m, _, _, _, _, _ = 
find_new_features_of_contours(contours_only_text_parent) - cx_h, cy_h, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_h) + cx_m, cy_m = find_center_of_contours(contours_only_text_parent) + cx_h, cy_h = find_center_of_contours(contours_only_text_parent_h) cx_d, cy_d, _, _, y_min_d, y_max_d, _ = find_new_features_of_contours(polygons_of_drop_capitals) img_con_all = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) @@ -89,9 +90,9 @@ def adhere_drop_capital_region_into_corresponding_textline( region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1 # print(region_final,'region_final') - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + # cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -153,9 +154,9 @@ def adhere_drop_capital_region_into_corresponding_textline( # areas_main=np.array([cv2.contourArea(all_found_textline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_textline_polygons[int(region_final)]))]) - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + # cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -208,7 +209,7 @@ def adhere_drop_capital_region_into_corresponding_textline( try: # print(all_found_textline_polygons[j_cont][0]) - cx_t, cy_t, _, _, _, _, _ = 
find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -261,7 +262,7 @@ def adhere_drop_capital_region_into_corresponding_textline( else: pass - ##cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + ##cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) ###print(all_box_coord[j_cont]) ###print(cx_t) ###print(cy_t) @@ -315,9 +316,9 @@ def adhere_drop_capital_region_into_corresponding_textline( region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1 # print(region_final,'region_final') - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + # cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -375,12 +376,12 @@ def adhere_drop_capital_region_into_corresponding_textline( # areas_main=np.array([cv2.contourArea(all_found_textline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_textline_polygons[int(region_final)]))]) - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + # cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(cx_t,'print') try: # print(all_found_textline_polygons[j_cont][0]) - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # 
print(cy_t) @@ -453,7 +454,7 @@ def adhere_drop_capital_region_into_corresponding_textline( #####try: #####if len(contours_new_parent)==1: ######print(all_found_textline_polygons[j_cont][0]) - #####cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[j_cont]) + #####cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[j_cont]) ######print(all_box_coord[j_cont]) ######print(cx_t) ######print(cy_t) From a1c8fd44677fc894395652de070710a5fc6aae2e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:41:37 +0200 Subject: [PATCH 051/101] do_order_of_regions / order_of_regions: simplify - array-convert only once (before returning from `order_of_regions`) - avoid passing `matrix_of_orders` unnecessarily between `order_of_regions` and `order_and_id_of_texts` --- src/eynollah/eynollah.py | 73 +++++++++++++++++----------------- src/eynollah/utils/__init__.py | 2 +- src/eynollah/utils/xml.py | 6 +-- 3 files changed, 38 insertions(+), 43 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 357c0c2..8351ab6 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2567,26 +2567,25 @@ class Eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij in range(len(boxes)): - ys = slice(*boxes[iij][2:4]) - xs = slice(*boxes[iij][0:2]) + for iij, box in enumerate(boxes): + ys = slice(*box[2:4]) + xs = slice(*box[0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] con_inter_box = contours_only_text_parent[args_contours_box] con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] - - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], 
con_inter_box, con_inter_box_h, box[2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, - matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] + indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] + indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] + indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] @@ -2664,25 +2663,25 @@ class Eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij, _ in enumerate(boxes): - ys = slice(*boxes[iij][2:4]) - xs = slice(*boxes[iij][0:2]) + for iij, box in enumerate(boxes): + ys = slice(*box[2:4]) + xs = slice(*box[0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] con_inter_box = contours_only_text_parent[args_contours_box] con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, - matrix_of_orders, 
indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] + indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] + indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] + indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] @@ -2747,22 +2746,22 @@ class Eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij in range(len(boxes)): - ys = slice(*boxes[iij][2:4]) - xs = slice(*boxes[iij][0:2]) + for iij, box in enumerate(boxes): + ys = slice(*box[2:4]) + xs = slice(*box[0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] con_inter_box = contours_only_text_parent[args_contours_box] con_inter_box_h = [] - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, - matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = 
np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] + indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] @@ -2808,24 +2807,24 @@ class Eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij in range(len(boxes)): - ys = slice(*boxes[iij][2:4]) - xs = slice(*boxes[iij][0:2]) + for iij, box in enumerate(boxes): + ys = slice(*box[2:4]) + xs = slice(*box[0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] con_inter_box = [] con_inter_box_h = [] for i in range(len(args_contours_box)): con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, - matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] + indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 4eee5a9..27a85da 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1325,7 +1325,7 @@ def 
order_of_regions(textline_mask, contours_main, contours_header, y_ref): final_types.append(1) final_index_type.append(ind_missed) - return final_indexers_sorted, matrix_of_orders, final_types, final_index_type + return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( img_p_in_ver, img_in_hor,num_col_classifier): diff --git a/src/eynollah/utils/xml.py b/src/eynollah/utils/xml.py index 13420df..a61dadb 100644 --- a/src/eynollah/utils/xml.py +++ b/src/eynollah/utils/xml.py @@ -65,11 +65,7 @@ def xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_margina og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) region_counter.inc('region') -def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): - indexes_sorted = np.array(indexes_sorted) - index_of_types = np.array(index_of_types) - kind_of_texts = np.array(kind_of_texts) - +def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, indexes_sorted, index_of_types, kind_of_texts, ref_point): id_of_texts = [] order_of_texts = [] From 4950e6bd784e2078ca7b65b1fcbf20de29d0f613 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 22:28:52 +0200 Subject: [PATCH 052/101] order_of_regions: simplify - use new `find_center_of_contours` - avoid unused calculations - avoid loops in favour of array processing --- src/eynollah/utils/__init__.py | 131 +++++++++------------------------ 1 file changed, 34 insertions(+), 97 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 27a85da..92da14a 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -15,10 +15,21 @@ from scipy.ndimage import gaussian_filter1d from .is_nan import isNaN from 
.contour import (contours_in_same_horizon, + find_center_of_contours, find_new_features_of_contours, return_contours_of_image, return_parent_contours) +def pairwise(iterable): + # pairwise('ABCDEFG') → AB BC CD DE EF FG + + iterator = iter(iterable) + a = next(iterator, None) + + for b in iterator: + yield a, b + a = b + def return_x_start_end_mothers_childs_and_type_of_reading_order( x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, cy_hor_diff): @@ -1183,106 +1194,45 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_of_regions(textline_mask, contours_main, contours_header, y_ref): +def order_of_regions(textline_mask, contours_main, contours_head, y_ref): ##plt.imshow(textline_mask) ##plt.show() - """ - print(len(contours_main),'contours_main') - mada_n=textline_mask.sum(axis=1) - y=mada_n[:] - - y_help=np.zeros(len(y)+40) - y_help[20:len(y)+20]=y - x=np.arange(len(y)) - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) - ##plt.imshow(textline_mask[:,:]) - ##plt.show() - - sigma_gaus=8 - z= gaussian_filter1d(y_help, sigma_gaus) - zneg_rev=-y_help+np.max(y_help) - zneg=np.zeros(len(zneg_rev)+40) - zneg[20:len(zneg_rev)+20]=zneg_rev - zneg= gaussian_filter1d(zneg, sigma_gaus) - - peaks, _ = find_peaks(z, height=0) - peaks_neg, _ = find_peaks(zneg, height=0) - peaks_neg=peaks_neg-20-20 - peaks=peaks-20 - """ - textline_sum_along_width = textline_mask.sum(axis=1) - - y = textline_sum_along_width[:] + y = textline_mask.sum(axis=1) # horizontal projection profile y_padded = np.zeros(len(y) + 40) y_padded[20 : len(y) + 20] = y - x = np.arange(len(y)) - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) sigma_gaus = 8 - z = gaussian_filter1d(y_padded, sigma_gaus) - zneg_rev = -y_padded + np.max(y_padded) + #z = gaussian_filter1d(y_padded, sigma_gaus) + #peaks, _ = find_peaks(z, height=0) + #peaks = peaks - 20 + 
zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev zneg = gaussian_filter1d(zneg, sigma_gaus) - peaks, _ = find_peaks(z, height=0) peaks_neg, _ = find_peaks(zneg, height=0) peaks_neg = peaks_neg - 20 - 20 - peaks = peaks - 20 ##plt.plot(z) ##plt.show() - if contours_main != None: - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + cx_main, cy_main = find_center_of_contours(contours_main) + cx_head, cy_head = find_center_of_contours(contours_head) - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + peaks_neg_new = np.append(np.insert(peaks_neg, 0, 0), textline_mask.shape[0]) + # offset from bbox of mask + peaks_neg_new += y_ref - if len(contours_header) != None: - areas_header = np.array([cv2.contourArea(contours_header[j]) for j in range(len(contours_header))]) - M_header = [cv2.moments(contours_header[j]) for j in range(len(contours_header))] - cx_header = [(M_header[j]["m10"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - cy_header = [(M_header[j]["m01"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - x_min_header = np.array([np.min(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - x_max_header = np.array([np.max(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - - y_min_header = 
np.array([np.min(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - y_max_header = np.array([np.max(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - # print(cy_main,'mainy') - - peaks_neg_new = [] - peaks_neg_new.append(0 + y_ref) - for iii in range(len(peaks_neg)): - peaks_neg_new.append(peaks_neg[iii] + y_ref) - peaks_neg_new.append(textline_mask.shape[0] + y_ref) - - if len(cy_main) > 0 and np.max(cy_main) > np.max(peaks_neg_new): - cy_main = np.array(cy_main) * (np.max(peaks_neg_new) / np.max(cy_main)) - 10 - if contours_main != None: - indexer_main = np.arange(len(contours_main)) - if contours_main != None: - len_main = len(contours_main) - else: - len_main = 0 - - matrix_of_orders = np.zeros((len(contours_main) + len(contours_header), 5)) - matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_header)) + matrix_of_orders = np.zeros((len(contours_main) + len(contours_head), 5), dtype=int) + matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head)) matrix_of_orders[: len(contours_main), 1] = 1 matrix_of_orders[len(contours_main) :, 1] = 2 matrix_of_orders[: len(contours_main), 2] = cx_main - matrix_of_orders[len(contours_main) :, 2] = cx_header + matrix_of_orders[len(contours_main) :, 2] = cx_head matrix_of_orders[: len(contours_main), 3] = cy_main - matrix_of_orders[len(contours_main) :, 3] = cy_header + matrix_of_orders[len(contours_main) :, 3] = cy_head matrix_of_orders[: len(contours_main), 4] = np.arange(len(contours_main)) - matrix_of_orders[len(contours_main) :, 4] = np.arange(len(contours_header)) + matrix_of_orders[len(contours_main) :, 4] = np.arange(len(contours_head)) # print(peaks_neg_new,'peaks_neg_new') # print(matrix_of_orders,'matrix_of_orders') @@ -1290,27 +1240,14 @@ def order_of_regions(textline_mask, contours_main, contours_header, y_ref): final_indexers_sorted = [] final_types = [] final_index_type = [] - for i in range(len(peaks_neg_new) - 1): - top = 
peaks_neg_new[i] - down = peaks_neg_new[i + 1] - indexes_in = matrix_of_orders[:, 0][(matrix_of_orders[:, 3] >= top) & - ((matrix_of_orders[:, 3] < down))] - cxs_in = matrix_of_orders[:, 2][(matrix_of_orders[:, 3] >= top) & - ((matrix_of_orders[:, 3] < down))] - cys_in = matrix_of_orders[:, 3][(matrix_of_orders[:, 3] >= top) & - ((matrix_of_orders[:, 3] < down))] - types_of_text = matrix_of_orders[:, 1][(matrix_of_orders[:, 3] >= top) & - (matrix_of_orders[:, 3] < down)] - index_types_of_text = matrix_of_orders[:, 4][(matrix_of_orders[:, 3] >= top) & - (matrix_of_orders[:, 3] < down)] + for top, bot in pairwise(peaks_neg_new): + indexes_in, types_in, cxs_in, cys_in, typed_indexes_in = \ + matrix_of_orders[(matrix_of_orders[:, 3] >= top) & + (matrix_of_orders[:, 3] < bot)].T sorted_inside = np.argsort(cxs_in) - ind_in_int = indexes_in[sorted_inside] - ind_in_type = types_of_text[sorted_inside] - ind_ind_type = index_types_of_text[sorted_inside] - for j in range(len(ind_in_int)): - final_indexers_sorted.append(int(ind_in_int[j])) - final_types.append(int(ind_in_type[j])) - final_index_type.append(int(ind_ind_type[j])) + final_indexers_sorted.extend(indexes_in[sorted_inside]) + final_types.extend(types_in[sorted_inside]) + final_index_type.extend(typed_indexes_in[sorted_inside]) ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] From 7387f5a92994bc5c2678be643816e5883f32cfa1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 22:35:40 +0200 Subject: [PATCH 053/101] do_order_of_regions: improve box matching, simplify - when searching for boxes matching contour, be more precise: - avoid heuristic rules ("xmin + 80 within xrange") in favour of exact criteria (contour properly contained in box) - for fallback criterion (nearest centers), also require proper containment of center in box - `order_of_regions`: remove (now) unnecessary (and insufficient) workaround for missing indexes (if boxes are not covering contours exactly) --- 
src/eynollah/eynollah.py | 185 ++++++++++++++++++--------------- src/eynollah/utils/__init__.py | 14 +-- 2 files changed, 106 insertions(+), 93 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 8351ab6..3194b66 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2518,51 +2518,59 @@ class Eynollah: contours_only_text_parent = np.array(contours_only_text_parent) contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side - cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( + c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), + 0.5 * boxes[:, 0:2].sum(axis=1))) + cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours( contours_only_text_parent) - cx_text_only_h, cy_text_only_h, x_min_text_only_h, _, _, _, y_cor_x_min_main_h = find_new_features_of_contours( + cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours( contours_only_text_parent_h) try: arg_text_con = [] - for ii in range(len(cx_text_only)): + for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (x_min_text_only[ii] + 80 >= boxes[jj][0] and - x_min_text_only[ii] + 80 < boxes[jj][1] and - y_cor_x_min_main[ii] >= boxes[jj][2] and - y_cor_x_min_main[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (mx_main[ii] >= box[0] and + Mx_main[ii] < box[1] and + my_main[ii] >= box[2] and + My_main[ii] < box[3]): arg_text_con.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + - (cy_text_only[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 
* box[0]) ** 2 + + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con.append(ind_min) - args_contours = np.array(range(len(arg_text_con))) + args_contours = np.arange(len(arg_text_con)) + order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_h = [] - for ii in range(len(cx_text_only_h)): + for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (x_min_text_only_h[ii] + 80 >= boxes[jj][0] and - x_min_text_only_h[ii] + 80 < boxes[jj][1] and - y_cor_x_min_main_h[ii] >= boxes[jj][2] and - y_cor_x_min_main_h[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (mx_head[ii] >= box[0] and + Mx_head[ii] < box[1] and + my_head[ii] >= box[2] and + My_head[ii] < box[3]): arg_text_con_h.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only_h[ii] - boxes[jj][1]) ** 2 + - (cy_text_only_h[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_h.append(ind_min) - args_contours_h = 
np.array(range(len(arg_text_con_h))) - + args_contours_h = np.arange(len(arg_text_con_h)) order_by_con_head = np.zeros(len(arg_text_con_h)) - order_by_con_main = np.zeros(len(arg_text_con)) ref_point = 0 order_of_texts_tot = [] @@ -2590,12 +2598,12 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -2611,53 +2619,59 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) except Exception as why: self.logger.error(why) arg_text_con = [] - for ii in range(len(cx_text_only)): + for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (cx_text_only[ii] >= boxes[jj][0] and - cx_text_only[ii] < boxes[jj][1] and - cy_text_only[ii] >= boxes[jj][2] and - cy_text_only[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (cx_main[ii] >= box[0] and + cx_main[ii] < box[1] and + cy_main[ii] >= box[2] and + cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located arg_text_con.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + - 
(cy_text_only[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con.append(ind_min) - args_contours = np.array(range(len(arg_text_con))) + args_contours = np.arange(len(arg_text_con)) order_by_con_main = np.zeros(len(arg_text_con)) ############################# head arg_text_con_h = [] - for ii in range(len(cx_text_only_h)): + for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (cx_text_only_h[ii] >= boxes[jj][0] and - cx_text_only_h[ii] < boxes[jj][1] and - cy_text_only_h[ii] >= boxes[jj][2] and - cy_text_only_h[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (cx_head[ii] >= box[0] and + cx_head[ii] < box[1] and + cy_head[ii] >= box[2] and + cy_head[ii] < box[3]): # this is valid if the center of region identify in which box it is located arg_text_con_h.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only_h[ii] - boxes[jj][1]) ** 2 + - (cy_text_only_h[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= 
cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_h.append(ind_min) - args_contours_h = np.array(range(len(arg_text_con_h))) + args_contours_h = np.arange(len(arg_text_con_h)) order_by_con_head = np.zeros(len(arg_text_con_h)) ref_point = 0 @@ -2686,14 +2700,14 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - for jji, _ in enumerate(id_of_texts): + for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) @@ -2707,7 +2721,7 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) self.logger.debug("exit do_order_of_regions_full_layout") return order_text_new, id_of_texts_tot @@ -2719,28 +2733,33 @@ class Eynollah: contours_only_text_parent = np.array(contours_only_text_parent) contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side - cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( + c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), + 0.5 * boxes[:, 0:2].sum(axis=1))) + cx_main, 
cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours( contours_only_text_parent) try: arg_text_con = [] - for ii in range(len(cx_text_only)): + for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (x_min_text_only[ii] + 80 >= boxes[jj][0] and - x_min_text_only[ii] + 80 < boxes[jj][1] and - y_cor_x_min_main[ii] >= boxes[jj][2] and - y_cor_x_min_main[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (mx_main[ii] >= box[0] and + Mx_main[ii] < box[1] and + my_main[ii] >= box[2] and + My_main[ii] < box[3]): arg_text_con.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + - (cy_text_only[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con.append(ind_min) - args_contours = np.array(range(len(arg_text_con))) + args_contours = np.arange(len(arg_text_con)) order_by_con_main = np.zeros(len(arg_text_con)) ref_point = 0 @@ -2766,7 +2785,7 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): 
order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -2779,29 +2798,29 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) except Exception as why: self.logger.error(why) arg_text_con = [] - for ii in range(len(cx_text_only)): + for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (cx_text_only[ii] >= boxes[jj][0] and - cx_text_only[ii] < boxes[jj][1] and - cy_text_only[ii] >= boxes[jj][2] and - cy_text_only[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (cx_main[ii] >= box[0] and + cx_main[ii] < box[1] and + cy_main[ii] >= box[2] and + cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located arg_text_con.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + - (cy_text_only[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) - arg_text_con.append(ind_min) - args_contours = np.array(range(len(arg_text_con))) + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) + arg_text_con[ii] = ind_min + args_contours = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros(len(arg_text_con)) ref_point = 0 @@ -2829,7 +2848,7 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ - 
np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -2843,7 +2862,7 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) self.logger.debug("exit do_order_of_regions_no_full_layout") return order_text_new, id_of_texts_tot diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 92da14a..6e5afd4 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1222,6 +1222,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): # offset from bbox of mask peaks_neg_new += y_ref + # assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new) + # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) matrix_of_orders = np.zeros((len(contours_main) + len(contours_head), 5), dtype=int) matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head)) @@ -1251,16 +1253,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] - # This fix is applied if the sum of the lengths of contours and contours_h - # does not match final_indexers_sorted. However, this is not the optimal solution.. 
- if len(cy_main) + len(cy_header) == len(final_index_type): - pass - else: - indexes_missed = set(np.arange(len(cy_main) + len(cy_header))) - set(final_indexers_sorted) - for ind_missed in indexes_missed: - final_indexers_sorted.append(ind_missed) - final_types.append(1) - final_index_type.append(ind_missed) + # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head) + # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main) return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) From e9bb62bd86747dabd5cd6fb5f67a36547c5c626d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 23:44:00 +0200 Subject: [PATCH 054/101] do_order_of_regions: simplify - avoid loops in favour of array processing --- src/eynollah/eynollah.py | 158 ++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 94 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 3194b66..6a3fd1e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2526,7 +2526,7 @@ class Eynollah: contours_only_text_parent_h) try: - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2534,7 +2534,7 @@ class Eynollah: Mx_main[ii] < box[1] and my_main[ii] >= box[2] and My_main[ii] < box[3]): - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2545,11 +2545,11 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - 
order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) - arg_text_con_h = [] + arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2557,7 +2557,7 @@ class Eynollah: Mx_head[ii] < box[1] and my_head[ii] >= box[2] and My_head[ii] < box[3]): - arg_text_con_h.append(jj) + arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2568,9 +2568,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_h.append(ind_min) - args_contours_h = np.arange(len(arg_text_con_h)) - order_by_con_head = np.zeros(len(arg_text_con_h)) + arg_text_con_head[ii] = ind_min + args_contours_head = np.arange(len(contours_only_text_parent_h)) + order_by_con_head = np.zeros_like(arg_text_con_head) ref_point = 0 order_of_texts_tot = [] @@ -2578,10 +2578,10 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + args_contours_box_head = args_contours_head[arg_text_con_head == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, 
index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2595,14 +2595,14 @@ class Eynollah: indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - for zahler, _ in enumerate(args_contours_box_h): + for zahler, _ in enumerate(args_contours_box_head): arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ + order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji in range(len(id_of_texts)): @@ -2610,20 +2610,13 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - for tj1 in range(len(contours_only_text_parent_h)): - order_of_texts_tot.append(int(order_by_con_head[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = np.concatenate((order_by_con_main, + order_by_con_head)) + order_text_new = np.argsort(order_of_texts_tot) except Exception as why: self.logger.error(why) - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2632,10 +2625,9 @@ class Eynollah: cy_main[ii] >= box[2] and 
cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break - if not check_if_textregion_located_in_a_box: # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) @@ -2644,13 +2636,11 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) - ############################# head - - arg_text_con_h = [] + arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2659,7 +2649,7 @@ class Eynollah: cy_head[ii] >= box[2] and cy_head[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con_h.append(jj) + arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2670,9 +2660,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_h.append(ind_min) - args_contours_h = np.arange(len(arg_text_con_h)) - order_by_con_head = np.zeros(len(arg_text_con_h)) + arg_text_con_head[ii] = ind_min + args_contours_head = 
np.arange(len(contours_only_text_parent_h)) + order_by_con_head = np.zeros_like(arg_text_con_head) ref_point = 0 order_of_texts_tot = [] @@ -2680,10 +2670,10 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + args_contours_box_head = args_contours_head[arg_text_con_head == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2697,14 +2687,14 @@ class Eynollah: indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - for zahler, _ in enumerate(args_contours_box_h): + for zahler, _ in enumerate(args_contours_box_head): arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ + order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji in range(len(id_of_texts)): @@ -2712,16 +2702,9 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - 
order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - for tj1 in range(len(contours_only_text_parent_h)): - order_of_texts_tot.append(int(order_by_con_head[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = np.concatenate((order_by_con_main, + order_by_con_head)) + order_text_new = np.argsort(order_of_texts_tot) self.logger.debug("exit do_order_of_regions_full_layout") return order_text_new, id_of_texts_tot @@ -2739,7 +2722,7 @@ class Eynollah: contours_only_text_parent) try: - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2747,7 +2730,7 @@ class Eynollah: Mx_main[ii] < box[1] and my_main[ii] >= box[2] and My_main[ii] < box[3]): - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2758,9 +2741,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) ref_point = 0 order_of_texts_tot = [] @@ -2768,8 +2751,8 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] + 
args_contours_box_main = args_contours_main[arg_text_con_main == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = [] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( @@ -2782,9 +2765,9 @@ class Eynollah: indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): @@ -2792,17 +2775,12 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = order_by_con_main + order_text_new = np.argsort(order_of_texts_tot) except Exception as why: self.logger.error(why) - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2811,7 +2789,7 @@ class Eynollah: cy_main[ii] >= box[2] and cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2819,9 +2797,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 
0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con[ii] = ind_min - args_contours = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) ref_point = 0 order_of_texts_tot = [] @@ -2829,11 +2807,9 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = [] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = [] - for i in range(len(args_contours_box)): - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2845,9 +2821,9 @@ class Eynollah: indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): @@ -2855,14 +2831,8 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - 
order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = order_by_con_main + order_text_new = np.argsort(order_of_texts_tot) self.logger.debug("exit do_order_of_regions_no_full_layout") return order_text_new, id_of_texts_tot From e674ea08f383de0c87f950be153fc954c3b4308e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Oct 2025 00:59:25 +0200 Subject: [PATCH 055/101] do_order_of_regions: drop redundant no/full_layout (`_no_full_layout` is the same copied code as `_full_layout`; the latter runs just the same if passed an empty list for headings) --- src/eynollah/eynollah.py | 141 ++------------------------------------ src/eynollah/utils/xml.py | 4 +- 2 files changed, 6 insertions(+), 139 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6a3fd1e..629b001 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2511,10 +2511,10 @@ class Eynollah: self.logger.debug("exit get_regions_from_xy_2models") return text_regions_p_true, erosion_hurts, polygons_seplines - def do_order_of_regions_full_layout( + def do_order_of_regions( self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): - self.logger.debug("enter do_order_of_regions_full_layout") + self.logger.debug("enter do_order_of_regions") contours_only_text_parent = np.array(contours_only_text_parent) contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side @@ -2706,135 +2706,7 @@ class Eynollah: order_by_con_head)) order_text_new = np.argsort(order_of_texts_tot) - self.logger.debug("exit do_order_of_regions_full_layout") - return order_text_new, id_of_texts_tot - - def do_order_of_regions_no_full_layout( - self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): - - self.logger.debug("enter do_order_of_regions_no_full_layout") - contours_only_text_parent = 
np.array(contours_only_text_parent) - contours_only_text_parent_h = np.array(contours_only_text_parent_h) - boxes = np.array(boxes, dtype=int) # to be on the safe side - c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), - 0.5 * boxes[:, 0:2].sum(axis=1))) - cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours( - contours_only_text_parent) - - try: - arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) - for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if (mx_main[ii] >= box[0] and - Mx_main[ii] < box[1] and - my_main[ii] >= box[2] and - My_main[ii] < box[3]): - arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - break - if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_main[ii] = ind_min - args_contours_main = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros_like(arg_text_con_main) - - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij, box in enumerate(boxes): - ys = slice(*box[2:4]) - xs = slice(*box[0:2]) - args_contours_box_main = args_contours_main[arg_text_con_main == iij] - con_inter_box = contours_only_text_parent[args_contours_box_main] - con_inter_box_h = [] - - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) - - order_of_texts, id_of_texts = 
order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji, _ in enumerate(id_of_texts): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = order_by_con_main - order_text_new = np.argsort(order_of_texts_tot) - - except Exception as why: - self.logger.error(why) - arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) - for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if (cx_main[ii] >= box[0] and - cx_main[ii] < box[1] and - cy_main[ii] >= box[2] and - cy_main[ii] < box[3]): - # this is valid if the center of region identify in which box it is located - arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - break - if not check_if_textregion_located_in_a_box: - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_main[ii] = ind_min - args_contours_main = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros_like(arg_text_con_main) - - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij, box in enumerate(boxes): - ys = slice(*box[2:4]) - xs = slice(*box[0:2]) - 
args_contours_box_main = args_contours_main[arg_text_con_main == iij] - con_inter_box = contours_only_text_parent[args_contours_box_main] - con_inter_box_h = [] - - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) - - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji, _ in enumerate(id_of_texts): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = order_by_con_main - order_text_new = np.argsort(order_of_texts_tot) - - self.logger.debug("exit do_order_of_regions_no_full_layout") + self.logger.debug("exit do_order_of_regions") return order_text_new, id_of_texts_tot def check_iou_of_bounding_box_and_contour_for_tables( @@ -3081,11 +2953,6 @@ class Eynollah: image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv]),:,:]=pixel_table return image_revised_last - def do_order_of_regions(self, *args, **kwargs): - if self.full_layout: - return self.do_order_of_regions_full_layout(*args, **kwargs) - return self.do_order_of_regions_no_full_layout(*args, **kwargs) - def get_tables_from_model(self, img, num_col_classifier): img_org = np.copy(img) img_height_h = img_org.shape[0] @@ -5170,7 +5037,7 @@ class Eynollah: return pcgts - contours_only_text_parent_h = None + contours_only_text_parent_h = [] self.logger.info("Step 4/5: Reading Order Detection") if 
self.reading_order_machine_based: diff --git a/src/eynollah/utils/xml.py b/src/eynollah/utils/xml.py index a61dadb..88d1df8 100644 --- a/src/eynollah/utils/xml.py +++ b/src/eynollah/utils/xml.py @@ -57,8 +57,8 @@ def xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_margina og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) region_counter.inc('region') - for idx_textregion, _ in enumerate(order_of_texts): - og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=region_counter.region_id(order_of_texts[idx_textregion] + 1))) + for idx_textregion in order_of_texts: + og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=region_counter.region_id(idx_textregion + 1))) region_counter.inc('region') for id_marginal in id_of_marginalia_right: From 29b4527bdebf6583f32b8801aed26f6ae70d25c7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Oct 2025 02:06:08 +0200 Subject: [PATCH 056/101] do_order_of_regions: simplify - remove duplicate code via inline def for the try-catch --- src/eynollah/eynollah.py | 127 +++++++-------------------------------- 1 file changed, 22 insertions(+), 105 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 629b001..bb3d1bf 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2525,22 +2525,23 @@ class Eynollah: cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours( contours_only_text_parent_h) - try: + def match_boxes(only_centers: bool): arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): - if (mx_main[ii] >= box[0] and - Mx_main[ii] < box[1] and - my_main[ii] >= box[2] and - My_main[ii] < box[3]): + if ((cx_main[ii] >= box[0] and + cx_main[ii] 
< box[1] and + cy_main[ii] >= box[2] and + cy_main[ii] < box[3]) if only_centers else + (mx_main[ii] >= box[0] and + Mx_main[ii] < box[1] and + my_main[ii] >= box[2] and + My_main[ii] < box[3])): arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) @@ -2553,17 +2554,18 @@ class Eynollah: for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): - if (mx_head[ii] >= box[0] and - Mx_head[ii] < box[1] and - my_head[ii] >= box[2] and - My_head[ii] < box[3]): + if ((cx_head[ii] >= box[0] and + cx_head[ii] < box[1] and + cy_head[ii] >= box[2] and + cy_head[ii] < box[3]) if only_centers else + (mx_head[ii] >= box[0] and + Mx_head[ii] < box[1] and + my_head[ii] >= box[2] and + My_head[ii] < box[3])): arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) @@ -2613,101 +2615,16 @@ class Eynollah: order_of_texts_tot = np.concatenate((order_by_con_main, order_by_con_head)) order_text_new = np.argsort(order_of_texts_tot) + return order_text_new, id_of_texts_tot + try: + results = 
match_boxes(False) except Exception as why: self.logger.error(why) - arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) - for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if (cx_main[ii] >= box[0] and - cx_main[ii] < box[1] and - cy_main[ii] >= box[2] and - cy_main[ii] < box[3]): - # this is valid if the center of region identify in which box it is located - arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - break - if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_main[ii] = ind_min - args_contours_main = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros_like(arg_text_con_main) - - arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) - for ii in range(len(contours_only_text_parent_h)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if (cx_head[ii] >= box[0] and - cx_head[ii] < box[1] and - cy_head[ii] >= box[2] and - cy_head[ii] < box[3]): - # this is valid if the center of region identify in which box it is located - arg_text_con_head[ii] = jj - check_if_textregion_located_in_a_box = True - break - if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] - dists_tr_from_box = np.linalg.norm(c_boxes - 
np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_head[ii] = ind_min - args_contours_head = np.arange(len(contours_only_text_parent_h)) - order_by_con_head = np.zeros_like(arg_text_con_head) - - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij, box in enumerate(boxes): - ys = slice(*box[2:4]) - xs = slice(*box[0:2]) - args_contours_box_main = args_contours_main[arg_text_con_main == iij] - args_contours_box_head = args_contours_head[arg_text_con_head == iij] - con_inter_box = contours_only_text_parent[args_contours_box_main] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] - - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) - - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] - indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for zahler, _ in enumerate(args_contours_box_head): - arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji in range(len(id_of_texts)): - 
order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = np.concatenate((order_by_con_main, - order_by_con_head)) - order_text_new = np.argsort(order_of_texts_tot) + results = match_boxes(True) self.logger.debug("exit do_order_of_regions") - return order_text_new, id_of_texts_tot + return results def check_iou_of_bounding_box_and_contour_for_tables( self, layout, table_prediction_early, pixel_table, num_col_classifier): From d774a23daa80cad0baa16dc4b41e93b93bca39bf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Oct 2025 02:18:17 +0200 Subject: [PATCH 057/101] matching deskewed text region contours with predicted: simplify - avoid loops in favour of array processing - improve readability and identifiers --- src/eynollah/eynollah.py | 108 +++++++++++++++------------------------ 1 file changed, 40 insertions(+), 68 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index bb3d1bf..dd6172a 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4559,27 +4559,16 @@ class Eynollah: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) - if areas_cnt_text[jz] > MIN_AREA_REGION] - areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION] + contour0 = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] + areas_cnt_text_parent = areas_cnt_text[areas_cnt_text > MIN_AREA_REGION] + index_con_parents = np.argsort(areas_cnt_text_parent) + contours_only_text_parent = 
contours_only_text_parent[index_con_parents] + areas_cnt_text_parent = areas_cnt_text_parent[index_con_parents] - contours_only_text_parent = self.return_list_of_contours_with_desired_order( - contours_only_text_parent, index_con_parents) - - ##try: - ##contours_only_text_parent = \ - ##list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) - ##except: - ##contours_only_text_parent = \ - ##list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents]) - ##areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - areas_cnt_text_parent = self.return_list_of_contours_with_desired_order( - areas_cnt_text_parent, index_con_parents) - - cx_bigest_big, cy_biggest_big = find_center_of_contours([contours_biggest]) - cx_bigest, cy_biggest = find_center_of_contours(contours_only_text_parent) + center0 = np.stack(find_center_of_contours([contour0])) # [2, 1] + centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) @@ -4588,65 +4577,48 @@ class Eynollah: areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - if len(areas_cnt_text_d)>0: - contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] + if len(contours_only_text_parent_d): + contour0_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] index_con_parents_d = np.argsort(areas_cnt_text_d) - contours_only_text_parent_d = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d, index_con_parents_d) - #try: - #contours_only_text_parent_d = \ - #list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d]) - #except: - #contours_only_text_parent_d = \ - #list(np.array(contours_only_text_parent_d,dtype=np.int32)[index_con_parents_d]) - 
#areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d]) - areas_cnt_text_d = self.return_list_of_contours_with_desired_order( - areas_cnt_text_d, index_con_parents_d) + contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] + # rs: should be the same, no? + assert np.all(contour0_d == contours_only_text_parent_d[-1]), (np.argmax(areas_cnt_text_d), index_con_parents_d[-1]) + areas_cnt_text_d = areas_cnt_text_d[index_con_parents_d] - cx_bigest_d_big, cy_biggest_d_big = find_center_of_contours([contours_biggest_d]) - cx_bigest_d, cy_biggest_d = find_center_of_contours(contours_only_text_parent_d) - try: - if len(cx_bigest_d) >= 5: - cx_bigest_d_last5 = cx_bigest_d[-5:] - cy_biggest_d_last5 = cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + - (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) - for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) - else: - cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] - cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] - dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + - (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) - for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) - - cx_bigest_d_big[0] = cx_bigest_d[ind_largest] - cy_biggest_d_big[0] = cy_biggest_d[ind_largest] - except Exception as why: - self.logger.error(str(why)) + center0_d = np.stack(find_center_of_contours([contour0_d])) # [2, 1] + centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] + # rs: should be the same, no? 
+ assert center0_d[0,0] == centers_d[0,-1] and center0_d[1,0] == centers_d[1,-1] + last5_centers_d = centers_d[:, -5:] + dists_d = np.linalg.norm(center0 - last5_centers_d, axis=0) + ind_largest = len(contours_only_text_parent_d) - last5_centers_d.shape[1] + np.argmin(dists_d) + center0_d[:, 0] = centers_d[:, ind_largest] + # order new contours the same way as the undeskewed contours + # (by calculating the offset of the largest contours, respectively, + # of the new and undeskewed image; then for each contour, + # finding the closest new contour, with proximity calculated + # as distance of their centers modulo offset vector) (h, w) = text_only.shape[:2] center = (w // 2.0, h // 2.0) M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) M_22 = np.array(M)[:2, :2] - p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - x_diff = p_big[0] - cx_bigest_d_big - y_diff = p_big[1] - cy_biggest_d_big + p0 = np.dot(M_22, center0) # [2, 1] + offset = p0 - center0_d # [2, 1] + # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) contours_only_text_parent_d_ordered = [] for i in range(len(contours_only_text_parent)): - p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) - p[0] = p[0] - x_diff[0] - p[1] = p[1] - y_diff[0] - dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + - (p[1] - cy_biggest_d[j]) ** 2) - for j in range(len(cx_bigest_d))] - contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) - # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) - # plt.imshow(img2[:,:,0]) - # plt.show() + p = np.dot(M_22, centers[:, i:i+1]) # [2, 1] + p -= offset + dists = np.linalg.norm(p - centers_d, axis=0) + contours_only_text_parent_d_ordered.append( + contours_only_text_parent_d[np.argmin(dists)]) + # cv2.fillPoly(img2, pts=[contours_only_text_parent_d[np.argmin(dists)]], color=i + 1) + # plt.imshow(img2) + # plt.show() + # rs: what about the 
remaining contours_only_text_parent_d? + # rs: what about duplicates? else: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] From 73e5a1def8489f6bf022e696f010d4c852ff685b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Oct 2025 02:33:03 +0200 Subject: [PATCH 058/101] matching deskewed text region contours with predicted: simplify - (no need for argmax if already sorted) --- src/eynollah/eynollah.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index dd6172a..46437f0 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4559,7 +4559,6 @@ class Eynollah: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contour0 = contours_only_text_parent[np.argmax(areas_cnt_text)] contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] areas_cnt_text_parent = areas_cnt_text[areas_cnt_text > MIN_AREA_REGION] @@ -4567,9 +4566,11 @@ class Eynollah: contours_only_text_parent = contours_only_text_parent[index_con_parents] areas_cnt_text_parent = areas_cnt_text_parent[index_con_parents] - center0 = np.stack(find_center_of_contours([contour0])) # [2, 1] centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] + contour0 = contours_only_text_parent[-1] + center0 = centers[:, -1:] # [2, 1] + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) @@ -4578,17 +4579,15 @@ class Eynollah: areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) if len(contours_only_text_parent_d): - contour0_d = 
contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] index_con_parents_d = np.argsort(areas_cnt_text_d) contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] - # rs: should be the same, no? - assert np.all(contour0_d == contours_only_text_parent_d[-1]), (np.argmax(areas_cnt_text_d), index_con_parents_d[-1]) areas_cnt_text_d = areas_cnt_text_d[index_con_parents_d] - center0_d = np.stack(find_center_of_contours([contour0_d])) # [2, 1] centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] - # rs: should be the same, no? - assert center0_d[0,0] == centers_d[0,-1] and center0_d[1,0] == centers_d[1,-1] + + contour0_d = contours_only_text_parent_d[-1] + center0_d = centers_d[:, -1:] # [2, 1] + last5_centers_d = centers_d[:, -5:] dists_d = np.linalg.norm(center0 - last5_centers_d, axis=0) ind_largest = len(contours_only_text_parent_d) - last5_centers_d.shape[1] + np.argmin(dists_d) From 0f33c21eb3a9cbe87f7221dd3481203de415794d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Oct 2025 02:45:01 +0200 Subject: [PATCH 059/101] matching deskewed text region contours with predicted: improve - when matching undeskewed and new contours, do not just pick the closest centers, respectively, but also of similar size (by making the contour area the 3rd dimension of the vector norm in the distance calculation) --- src/eynollah/eynollah.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 46437f0..e474916 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4610,7 +4610,11 @@ class Eynollah: for i in range(len(contours_only_text_parent)): p = np.dot(M_22, centers[:, i:i+1]) # [2, 1] p -= offset - dists = np.linalg.norm(p - centers_d, axis=0) + # add dimension for area + #dists = np.linalg.norm(p - centers_d, axis=0) + diffs = (np.append(p, [[areas_cnt_text_parent[i]]], axis=0) - + np.append(centers_d, 
areas_cnt_text_d[np.newaxis], axis=0)) + dists = np.linalg.norm(diffs, axis=0) contours_only_text_parent_d_ordered.append( contours_only_text_parent_d[np.argmin(dists)]) # cv2.fillPoly(img2, pts=[contours_only_text_parent_d[np.argmin(dists)]], color=i + 1) From 0e00d7868be55d3fb94b52fffc6ed96bf9387067 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 12:55:10 +0200 Subject: [PATCH 060/101] matching deskewed text region contours with predicted: improve - apply same min-area filter to deskewed contours as to original ones --- src/eynollah/eynollah.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index e474916..e5ad5ae 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4568,7 +4568,6 @@ class Eynollah: centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] - contour0 = contours_only_text_parent[-1] center0 = centers[:, -1:] # [2, 1] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: @@ -4578,6 +4577,9 @@ class Eynollah: areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) + contours_only_text_parent_d = np.array(contours_only_text_parent_d)[areas_cnt_text_d > MIN_AREA_REGION] + areas_cnt_text_d = areas_cnt_text_d[areas_cnt_text_d > MIN_AREA_REGION] + if len(contours_only_text_parent_d): index_con_parents_d = np.argsort(areas_cnt_text_d) contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] @@ -4585,9 +4587,10 @@ class Eynollah: centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] - contour0_d = contours_only_text_parent_d[-1] center0_d = centers_d[:, -1:] # [2, 1] + # find the largest among the largest 5 deskewed contours + # that is also closest to the largest original contour last5_centers_d = centers_d[:, -5:] dists_d = 
np.linalg.norm(center0 - last5_centers_d, axis=0) ind_largest = len(contours_only_text_parent_d) - last5_centers_d.shape[1] + np.argmin(dists_d) @@ -4762,14 +4765,7 @@ class Eynollah: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( contours_only_text_parent_d_ordered, index_by_text_par_con) - #try: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) - #except: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) else: - #takes long timee contours_only_text_parent_d_ordered = None if self.light_version: fun = check_any_text_region_in_model_one_is_main_or_header_light @@ -4949,12 +4945,6 @@ class Eynollah: else: contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( contours_only_text_parent_d_ordered, index_by_text_par_con) - #try: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - #except: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) From 155b8f68b8a7754de11e002e0df2bfc7292899d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 12:58:24 +0200 Subject: [PATCH 061/101] matching deskewed text region contours with predicted: improve - avoid duplicate and missing mappings by using a different approach: instead of just minimising the center distance for the N contours that we expect, 1. get all N:M distances 2. iterate over them from small to large 3. 
continue adding correspondences until both every original contour and every deskewed contour have at least one match 4. where one original matches multiple deskewed contours, join the latter polygons to map as single contour 5. where one deskewed contour matches multiple originals, split the former by intersecting with each of the latter (after bringing them into the same coordinate space), so ultimately only the respective match gets assigned --- src/eynollah/eynollah.py | 94 ++++++++++++++++++++++++++++------- src/eynollah/utils/contour.py | 15 ++++++ 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index e5ad5ae..5e32929 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -33,6 +33,7 @@ from concurrent.futures import ProcessPoolExecutor import xml.etree.ElementTree as ET import cv2 import numpy as np +import shapely.affinity from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from numba import cuda @@ -83,6 +84,10 @@ from .utils.contour import ( return_parent_contours, dilate_textregion_contours, dilate_textline_contours, + polygon2contour, + contour2polygon, + join_polygons, + make_intersection, ) from .utils.rotate import ( rotate_image, @@ -4556,8 +4561,9 @@ class Eynollah: contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) if len(contours_only_text_parent) > 0: + areas_tot_text = np.prod(text_only.shape) areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + areas_cnt_text = areas_cnt_text / float(areas_tot_text) #self.logger.info('areas_cnt_text %s', areas_cnt_text) contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] areas_cnt_text_parent = areas_cnt_text[areas_cnt_text > 
MIN_AREA_REGION] @@ -4574,8 +4580,9 @@ class Eynollah: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) + areas_tot_text_d = np.prod(text_only_d.shape) areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) - areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) + areas_cnt_text_d = areas_cnt_text_d / float(areas_tot_text_d) contours_only_text_parent_d = np.array(contours_only_text_parent_d)[areas_cnt_text_d > MIN_AREA_REGION] areas_cnt_text_d = areas_cnt_text_d[areas_cnt_text_d > MIN_AREA_REGION] @@ -4587,7 +4594,7 @@ class Eynollah: centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] - center0_d = centers_d[:, -1:] # [2, 1] + center0_d = centers_d[:, -1:].copy() # [2, 1] # find the largest among the largest 5 deskewed contours # that is also closest to the largest original contour @@ -4605,26 +4612,75 @@ class Eynollah: center = (w // 2.0, h // 2.0) M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) M_22 = np.array(M)[:2, :2] - p0 = np.dot(M_22, center0) # [2, 1] - offset = p0 - center0_d # [2, 1] + center0 = np.dot(M_22, center0) # [2, 1] + offset = center0 - center0_d # [2, 1] - # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) - contours_only_text_parent_d_ordered = [] + centers = np.dot(M_22, centers) - offset # [2,N] + # add dimension for area (so only contours of similar size will be considered close) + centers = np.append(centers, areas_cnt_text_parent[np.newaxis], axis=0) + centers_d = np.append(centers_d, areas_cnt_text_d[np.newaxis], axis=0) + + dists = np.zeros((len(contours_only_text_parent), len(contours_only_text_parent_d))) for i in range(len(contours_only_text_parent)): - p = np.dot(M_22, centers[:, i:i+1]) # [2, 1] - p -= offset - # add dimension for area - #dists = np.linalg.norm(p - centers_d, axis=0) - diffs = 
(np.append(p, [[areas_cnt_text_parent[i]]], axis=0) - - np.append(centers_d, areas_cnt_text_d[np.newaxis], axis=0)) - dists = np.linalg.norm(diffs, axis=0) - contours_only_text_parent_d_ordered.append( - contours_only_text_parent_d[np.argmin(dists)]) - # cv2.fillPoly(img2, pts=[contours_only_text_parent_d[np.argmin(dists)]], color=i + 1) + dists[i] = np.linalg.norm(centers[:, i:i + 1] - centers_d, axis=0) + corresp = np.zeros(dists.shape, dtype=bool) + # keep searching next-closest until at least one correspondence on each side + while not np.all(corresp.sum(axis=1)) and not np.all(corresp.sum(axis=0)): + idx = np.nanargmin(dists) + i, j = np.unravel_index(idx, dists.shape) + dists[i, j] = np.nan + corresp[i, j] = True + #print("original/deskewed adjacency", corresp.nonzero()) + contours_only_text_parent_d_ordered = np.zeros_like(contours_only_text_parent) + contours_only_text_parent_d_ordered = contours_only_text_parent_d[np.argmax(corresp, axis=1)] + # img1 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) + # for i in range(len(contours_only_text_parent)): + # cv2.fillPoly(img1, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) + # plt.subplot(2, 2, 1, title="direct corresp contours") + # plt.imshow(img1) + # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) + # join deskewed regions mapping to single original ones + for i in range(len(contours_only_text_parent)): + if np.count_nonzero(corresp[i]) > 1: + indices = np.flatnonzero(corresp[i]) + #print("joining", indices) + polygons_d = [contour2polygon(contour) + for contour in contours_only_text_parent_d[indices]] + contour_d = polygon2contour(join_polygons(polygons_d)) + contours_only_text_parent_d_ordered[i] = contour_d + # cv2.fillPoly(img2, pts=[contour_d], color=i + 1) + # plt.subplot(2, 2, 3, title="joined contours") # plt.imshow(img2) + # img3 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) + # split deskewed regions mapping to multiple original ones + def deskew(polygon): + polygon = 
shapely.affinity.rotate(polygon, -slope_deskew, origin=center) + polygon = shapely.affinity.translate(polygon, *offset.squeeze()) + return polygon + for j in range(len(contours_only_text_parent_d)): + if np.count_nonzero(corresp[:, j]) > 1: + indices = np.flatnonzero(corresp[:, j]) + #print("splitting along", indices) + polygons = [deskew(contour2polygon(contour)) + for contour in contours_only_text_parent[indices]] + polygon_d = contour2polygon(contours_only_text_parent_d[j]) + polygons_d = [make_intersection(polygon_d, polygon) + for polygon in polygons] + # ignore where there is no actual overlap + indices = indices[np.flatnonzero(polygons_d)] + contours_d = [polygon2contour(polygon_d) + for polygon_d in polygons_d + if polygon_d] + contours_only_text_parent_d_ordered[indices] = contours_d + # cv2.fillPoly(img3, pts=contours_d, color=j + 1) + # plt.subplot(2, 2, 4, title="split contours") + # plt.imshow(img3) + # img4 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) + # for i in range(len(contours_only_text_parent)): + # cv2.fillPoly(img4, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) + # plt.subplot(2, 2, 2, title="result contours") + # plt.imshow(img4) # plt.show() - # rs: what about the remaining contours_only_text_parent_d? - # rs: what about duplicates? 
else: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 041cbf6..8431bbe 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -335,6 +335,21 @@ def polygon2contour(polygon: Polygon) -> np.ndarray: polygon = np.array(polygon.exterior.coords[:-1], dtype=int) return np.maximum(0, polygon).astype(np.uint)[:, np.newaxis] +def make_intersection(poly1, poly2): + interp = poly1.intersection(poly2) + # post-process + if interp.is_empty or interp.area == 0.0: + return None + if interp.geom_type == 'GeometryCollection': + # heterogeneous result: filter zero-area shapes (LineString, Point) + interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) + if interp.geom_type == 'MultiPolygon': + # homogeneous result: construct convex hull to connect + interp = join_polygons(interp.geoms) + assert interp.geom_type == 'Polygon', interp.wkt + interp = make_valid(interp) + return interp + def make_valid(polygon: Polygon) -> Polygon: """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement.""" def isint(x): From fe603188f4f7f9d545b44085cdc45195f98f0546 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 13:11:03 +0200 Subject: [PATCH 062/101] avoid unnecessary 3-channel conversions --- src/eynollah/eynollah.py | 52 ++++----- src/eynollah/utils/__init__.py | 156 +++++++++++---------------- src/eynollah/utils/contour.py | 74 +++++-------- src/eynollah/utils/separate_lines.py | 53 ++++----- 4 files changed, 132 insertions(+), 203 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 5e32929..834ecf3 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -712,7 +712,7 @@ class Eynollah: if self.input_binary: img = self.imread() prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) - prediction_bin = 255 * 
(prediction_bin[:,:,0]==0) + prediction_bin = 255 * (prediction_bin[:,:,0] == 0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) img= np.copy(prediction_bin) img_bin = prediction_bin @@ -2064,9 +2064,7 @@ class Eynollah: boxes_sub_new = [] poly_sub = [] for mv in range(len(boxes_per_process)): - crop_img, _ = crop_image_inside_box(boxes_per_process[mv], - np.repeat(textline_mask_tot[:, :, np.newaxis], 3, axis=2)) - crop_img = crop_img[:, :, 0] + crop_img, _ = crop_image_inside_box(boxes_per_process[mv], textline_mask_tot) crop_img = cv2.erode(crop_img, KERNEL, iterations=2) try: textline_con, hierarchy = return_contours_of_image(crop_img) @@ -2638,10 +2636,8 @@ class Eynollah: layout_org[:,:,0][layout_org[:,:,0]==pixel_table] = 0 layout = (layout[:,:,0]==pixel_table)*1 - layout =np.repeat(layout[:, :, np.newaxis], 3, axis=2) layout = layout.astype(np.uint8) - imgray = cv2.cvtColor(layout, cv2.COLOR_BGR2GRAY ) - _, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(layout, 0, 255, 0) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cnt_size = np.array([cv2.contourArea(contours[j]) @@ -2652,8 +2648,8 @@ class Eynollah: x, y, w, h = cv2.boundingRect(contours[i]) iou = cnt_size[i] /float(w*h) *100 if iou<80: - layout_contour = np.zeros((layout_org.shape[0], layout_org.shape[1])) - layout_contour= cv2.fillPoly(layout_contour,pts=[contours[i]] ,color=(1,1,1)) + layout_contour = np.zeros(layout_org.shape[:2]) + layout_contour = cv2.fillPoly(layout_contour, pts=[contours[i]] ,color=1) layout_contour_sum = layout_contour.sum(axis=0) layout_contour_sum_diff = np.diff(layout_contour_sum) @@ -2669,20 +2665,17 @@ class Eynollah: layout_contour=cv2.erode(layout_contour[:,:], KERNEL, iterations=5) layout_contour=cv2.dilate(layout_contour[:,:], KERNEL, iterations=5) - layout_contour =np.repeat(layout_contour[:, :, np.newaxis], 3, axis=2) layout_contour = layout_contour.astype(np.uint8) - - 
imgray = cv2.cvtColor(layout_contour, cv2.COLOR_BGR2GRAY ) - _, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(layout_contour, 0, 255, 0) contours_sep, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) for ji in range(len(contours_sep) ): contours_new.append(contours_sep[ji]) if num_col_classifier>=2: - only_recent_contour_image = np.zeros((layout.shape[0],layout.shape[1])) - only_recent_contour_image= cv2.fillPoly(only_recent_contour_image, - pts=[contours_sep[ji]], color=(1,1,1)) + only_recent_contour_image = np.zeros(layout.shape[:2]) + only_recent_contour_image = cv2.fillPoly(only_recent_contour_image, + pts=[contours_sep[ji]], color=1) table_pixels_masked_from_early_pre = only_recent_contour_image * table_prediction_early iou_in = 100. * table_pixels_masked_from_early_pre.sum() / only_recent_contour_image.sum() #print(iou_in,'iou_in_in1') @@ -3210,13 +3203,11 @@ class Eynollah: pixel_lines = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: _, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_lines) + text_regions_p, num_col_classifier, self.tables, pixel_lines) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_lines) + text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3392,13 +3383,11 @@ class Eynollah: pixel_lines=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_lines) + text_regions_p, num_col_classifier, 
self.tables, pixel_lines) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_lines) + text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3498,7 +3487,7 @@ class Eynollah: #text_regions_p[:,:][regions_fully[:,:,0]==6]=6 ##regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) - ##regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 + ##regions_fully[:, :, 0][regions_fully_only_drop[:, :] == 4] = 4 drop_capital_label_in_full_layout_model = 3 drops = (regions_fully[:,:,0]==drop_capital_label_in_full_layout_model)*1 @@ -4715,7 +4704,6 @@ class Eynollah: return pcgts - #print("text region early 3 in %.1fs", time.time() - t0) if self.light_version: contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) @@ -4851,21 +4839,17 @@ class Eynollah: if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, label_seps, contours_only_text_parent_h) + text_regions_p, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h) else: _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) + text_regions_p_1_n, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - 
np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, label_seps) + text_regions_p, num_col_classifier, self.tables, label_seps) else: _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, label_seps) + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 6e5afd4..ebf78fe 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -796,7 +796,7 @@ def find_num_col_only_image(regions_without_separators, multiplier=3.8): return len(peaks_fin_true), peaks_fin_true def find_num_col_by_vertical_lines(regions_without_separators, multiplier=3.8): - regions_without_separators_0 = regions_without_separators[:, :, 0].sum(axis=0) + regions_without_separators_0 = regions_without_separators.sum(axis=0) ##plt.plot(regions_without_separators_0) ##plt.show() @@ -823,7 +823,10 @@ def return_regions_without_separators(regions_pre): return regions_without_separators def put_drop_out_from_only_drop_model(layout_no_patch, layout1): - drop_only = (layout_no_patch[:, :, 0] == 4) * 1 + if layout_no_patch.ndim == 3: + layout_no_patch = layout_no_patch[:, :, 0] + + drop_only = (layout_no_patch[:, :] == 4) * 1 contours_drop, hir_on_drop = return_contours_of_image(drop_only) contours_drop_parent = return_parent_contours(contours_drop, hir_on_drop) @@ -849,9 +852,8 @@ def put_drop_out_from_only_drop_model(layout_no_patch, layout1): (map_of_drop_contour_bb == 5).sum()) >= 15: contours_drop_parent_final.append(contours_drop_parent[jj]) - layout_no_patch[:, :, 0][layout_no_patch[:, :, 0] == 4] = 0 - - layout_no_patch = cv2.fillPoly(layout_no_patch, pts=contours_drop_parent_final, color=(4, 4, 4)) + layout_no_patch[:, :][layout_no_patch[:, :] == 4] 
= 0 + layout_no_patch = cv2.fillPoly(layout_no_patch, pts=contours_drop_parent_final, color=4) return layout_no_patch @@ -925,17 +927,16 @@ def check_any_text_region_in_model_one_is_main_or_header( contours_only_text_parent_main_d=[] contours_only_text_parent_head_d=[] - for ii in range(len(contours_only_text_parent)): - con=contours_only_text_parent[ii] - img=np.zeros((regions_model_1.shape[0],regions_model_1.shape[1],3)) - img = cv2.fillPoly(img, pts=[con], color=(255, 255, 255)) + for ii, con in enumerate(contours_only_text_parent): + img = np.zeros(regions_model_1.shape[:2]) + img = cv2.fillPoly(img, pts=[con], color=255) - all_pixels=((img[:,:,0]==255)*1).sum() - pixels_header=( ( (img[:,:,0]==255) & (regions_model_full[:,:,0]==2) )*1 ).sum() + all_pixels=((img == 255)*1).sum() + pixels_header=( ( (img == 255) & (regions_model_full[:,:,0]==2) )*1 ).sum() pixels_main=all_pixels-pixels_header if (pixels_header>=pixels_main) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ]=2 contours_only_text_parent_head.append(con) if contours_only_text_parent_d_ordered is not None: contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) @@ -944,7 +945,7 @@ def check_any_text_region_in_model_one_is_main_or_header( all_found_textline_polygons_head.append(all_found_textline_polygons[ii]) conf_contours_head.append(None) else: - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ]=1 contours_only_text_parent_main.append(con) conf_contours_main.append(conf_contours[ii]) if contours_only_text_parent_d_ordered is not None: @@ -1015,11 +1016,11 @@ def check_any_text_region_in_model_one_is_main_or_header_light( contours_only_text_parent_head_d=[] for ii, con in enumerate(contours_only_text_parent_z): - 
img=np.zeros((regions_model_1.shape[0], regions_model_1.shape[1], 3)) - img = cv2.fillPoly(img, pts=[con], color=(255, 255, 255)) + img = np.zeros(regions_model_1.shape[:2]) + img = cv2.fillPoly(img, pts=[con], color=255) - all_pixels = (img[:,:,0]==255).sum() - pixels_header=((img[:,:,0]==255) & + all_pixels = (img == 255).sum() + pixels_header=((img == 255) & (regions_model_full[:,:,0]==2)).sum() pixels_main = all_pixels - pixels_header @@ -1029,7 +1030,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( ( pixels_header / float(pixels_main) >= 0.3 and length_con[ii] / float(height_con[ii]) >=3 )): - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ] = 2 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ] = 2 contours_only_text_parent_head.append(contours_only_text_parent[ii]) conf_contours_head.append(None) # why not conf_contours[ii], too? if contours_only_text_parent_d_ordered is not None: @@ -1039,7 +1040,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( all_found_textline_polygons_head.append(all_found_textline_polygons[ii]) else: - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ] = 1 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ] = 1 contours_only_text_parent_main.append(contours_only_text_parent[ii]) conf_contours_main.append(conf_contours[ii]) if contours_only_text_parent_d_ordered is not None: @@ -1119,11 +1120,11 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_big.append(textlines_tot[i]) textlines_big_org_form.append(textlines_tot_org_form[i]) - img_textline_s = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_textline_s = cv2.fillPoly(img_textline_s, pts=textlines_small, color=(1, 1, 1)) + img_textline_s = np.zeros(textline_iamge.shape[:2]) + img_textline_s = cv2.fillPoly(img_textline_s, pts=textlines_small, color=1) - img_textline_b = np.zeros((textline_iamge.shape[0], 
textline_iamge.shape[1])) - img_textline_b = cv2.fillPoly(img_textline_b, pts=textlines_big, color=(1, 1, 1)) + img_textline_b = np.zeros(textline_iamge.shape[:2]) + img_textline_b = cv2.fillPoly(img_textline_b, pts=textlines_big, color=1) sum_small_big_all = img_textline_s + img_textline_b sum_small_big_all2 = (sum_small_big_all[:, :] == 2) * 1 @@ -1135,11 +1136,11 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) # print(len(textlines_small),'small') intersections = [] for z2 in range(len(textlines_big)): - img_text = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_text = cv2.fillPoly(img_text, pts=[textlines_small[z1]], color=(1, 1, 1)) + img_text = np.zeros(textline_iamge.shape[:2]) + img_text = cv2.fillPoly(img_text, pts=[textlines_small[z1]], color=1) - img_text2 = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z2]], color=(1, 1, 1)) + img_text2 = np.zeros(textline_iamge.shape[:2]) + img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z2]], color=1) sum_small_big = img_text2 + img_text sum_small_big_2 = (sum_small_big[:, :] == 2) * 1 @@ -1165,19 +1166,17 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) index_small_textlines = list(np.where(np.array(dis_small_from_bigs_tot) == z)[0]) # print(z,index_small_textlines) - img_text2 = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1], 3)) - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z]], color=(255, 255, 255)) + img_text2 = np.zeros(textline_iamge.shape[:2], dtype=np.uint8) + img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z]], color=255) textlines_big_with_change.append(z) for k in index_small_textlines: - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_small[k]], color=(255, 255, 255)) + img_text2 = cv2.fillPoly(img_text2, pts=[textlines_small[k]], color=255) textlines_small_with_change.append(k) - img_text2 = 
img_text2.astype(np.uint8) - imgray = cv2.cvtColor(img_text2, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - cont, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(img_text2, 0, 255, 0) + cont, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) # print(cont[0],type(cont)) textlines_big_with_change_con.append(cont) @@ -1189,9 +1188,8 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) # print(textlines_big_with_change,'textlines_big_with_change') # print(textlines_small_with_change,'textlines_small_with_change') # print(textlines_big) - textlines_con_changed.append(textlines_big_org_form) - else: - textlines_con_changed.append(textlines_big_org_form) + + textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed def order_of_regions(textline_mask, contours_main, contours_head, y_ref): @@ -1262,29 +1260,22 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( img_p_in_ver, img_in_hor,num_col_classifier): #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2) - img_p_in_ver=img_p_in_ver.astype(np.uint8) - img_p_in_ver=np.repeat(img_p_in_ver[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(img_p_in_ver, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_lines_ver,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(img_p_in_ver, 0, 255, 0) + contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) slope_lines_ver, _, x_min_main_ver, _, _, _, y_min_main_ver, y_max_main_ver, cx_main_ver = \ find_features_of_lines(contours_lines_ver) for i in range(len(x_min_main_ver)): img_p_in_ver[int(y_min_main_ver[i]): int(y_min_main_ver[i])+30, int(cx_main_ver[i])-25: - int(cx_main_ver[i])+25, 0] = 0 + int(cx_main_ver[i])+25] = 0 
img_p_in_ver[int(y_max_main_ver[i])-30: int(y_max_main_ver[i]), int(cx_main_ver[i])-25: - int(cx_main_ver[i])+25, 0] = 0 + int(cx_main_ver[i])+25] = 0 - img_in_hor=img_in_hor.astype(np.uint8) - img_in_hor=np.repeat(img_in_hor[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(img_in_hor, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_lines_hor,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(img_in_hor, 0, 255, 0) + contours_lines_hor, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, _, _, _, _ = \ find_features_of_lines(contours_lines_hor) @@ -1340,22 +1331,19 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( img_p_in=img_in_hor special_separators=[] - img_p_in_ver[:,:,0][img_p_in_ver[:,:,0]==255]=1 - sep_ver_hor=img_p_in+img_p_in_ver - sep_ver_hor_cross=(sep_ver_hor[:,:,0]==2)*1 - sep_ver_hor_cross=np.repeat(sep_ver_hor_cross[:, :, np.newaxis], 3, axis=2) - sep_ver_hor_cross=sep_ver_hor_cross.astype(np.uint8) - imgray = cv2.cvtColor(sep_ver_hor_cross, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_cross,_=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - cx_cross, cy_cross = find_center_of_contours(contours_cross) - for ii in range(len(cx_cross)): - img_p_in[int(cy_cross[ii])-30:int(cy_cross[ii])+30,int(cx_cross[ii])+5:int(cx_cross[ii])+40,0]=0 - img_p_in[int(cy_cross[ii])-30:int(cy_cross[ii])+30,int(cx_cross[ii])-40:int(cx_cross[ii])-4,0]=0 + img_p_in_ver[img_p_in_ver == 255] = 1 + sep_ver_hor = img_p_in + img_p_in_ver + sep_ver_hor_cross = (sep_ver_hor == 2) * 1 + _, thresh = cv2.threshold(sep_ver_hor_cross.astype(np.uint8), 0, 255, 0) + contours_cross, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + center_cross = 
np.array(find_center_of_contours(contours_cross), dtype=int) + for cx, cy in center_cross.T: + img_p_in[cy - 30: cy + 30, cx + 5: cx + 40] = 0 + img_p_in[cy - 30: cy + 30, cx - 40: cx - 4] = 0 else: img_p_in=np.copy(img_in_hor) special_separators=[] - return img_p_in[:,:,0], special_separators + return img_p_in, special_separators def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot = [] @@ -1365,11 +1353,11 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, pixel_lines, contours_h=None): +def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_lines, contours_h=None): t_ins_c0 = time.time() - separators_closeup=( (region_pre_p[:,:,:]==pixel_lines))*1 - separators_closeup[0:110,:,:]=0 - separators_closeup[separators_closeup.shape[0]-150:,:,:]=0 + separators_closeup=( (region_pre_p[:,:]==label_lines))*1 + separators_closeup[0:110,:]=0 + separators_closeup[separators_closeup.shape[0]-150:,:]=0 kernel = np.ones((5,5),np.uint8) separators_closeup=separators_closeup.astype(np.uint8) @@ -1381,15 +1369,11 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, separators_closeup_n=separators_closeup_n.astype(np.uint8) separators_closeup_n_binary=np.zeros(( separators_closeup_n.shape[0],separators_closeup_n.shape[1]) ) - separators_closeup_n_binary[:,:]=separators_closeup_n[:,:,0] + separators_closeup_n_binary[:,:]=separators_closeup_n[:,:] separators_closeup_n_binary[:,:][separators_closeup_n_binary[:,:]!=0]=1 - gray_early=np.repeat(separators_closeup_n_binary[:, :, np.newaxis], 3, axis=2) - gray_early=gray_early.astype(np.uint8) - imgray_e = cv2.cvtColor(gray_early, cv2.COLOR_BGR2GRAY) - ret_e, thresh_e = cv2.threshold(imgray_e, 0, 255, 0) - - 
contours_line_e,hierarchy_e=cv2.findContours(thresh_e,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh_e = cv2.threshold(separators_closeup_n_binary, 0, 255, 0) + contours_line_e, _ = cv2.findContours(thresh_e.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) _, dist_xe, _, _, _, _, y_min_main, y_max_main, _ = \ find_features_of_lines(contours_line_e) dist_ye = y_max_main - y_min_main @@ -1399,10 +1383,8 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, cnts_hor_e=[] for ce in args_hor_e: cnts_hor_e.append(contours_line_e[ce]) - figs_e=np.zeros(thresh_e.shape) - figs_e=cv2.fillPoly(figs_e,pts=cnts_hor_e,color=(1,1,1)) - separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=(0,0,0)) + separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) gray = cv2.bitwise_not(separators_closeup_n_binary) gray=gray.astype(np.uint8) @@ -1422,7 +1404,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, kernel = np.ones((5,5),np.uint8) horizontal = cv2.dilate(horizontal,kernel,iterations = 2) horizontal = cv2.erode(horizontal,kernel,iterations = 2) - horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=(255,255,255)) + horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=255) rows = vertical.shape[0] verticalsize = rows // 30 @@ -1440,13 +1422,8 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, separators_closeup_new[:,:][vertical[:,:]!=0]=1 separators_closeup_new[:,:][horizontal[:,:]!=0]=1 - vertical=np.repeat(vertical[:, :, np.newaxis], 3, axis=2) - vertical=vertical.astype(np.uint8) - - imgray = cv2.cvtColor(vertical, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_line_vers,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(vertical, 0, 255, 0) + contours_line_vers, _ = 
cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ find_features_of_lines(contours_line_vers) @@ -1461,11 +1438,8 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, dist_y_ver=y_max_main_ver-y_min_main_ver len_y=separators_closeup.shape[0]/3.0 - horizontal=np.repeat(horizontal[:, :, np.newaxis], 3, axis=2) - horizontal=horizontal.astype(np.uint8) - imgray = cv2.cvtColor(horizontal, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_line_hors,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(horizontal, 0, 255, 0) + contours_line_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ find_features_of_lines(contours_line_hors) @@ -1558,7 +1532,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, peaks_neg_fin_fin=[] for itiles in args_big_parts: regions_without_separators_tile=regions_without_separators[int(splitter_y_new[itiles]): - int(splitter_y_new[itiles+1]),:,0] + int(splitter_y_new[itiles+1]),:] try: num_col, peaks_neg_fin = find_num_col(regions_without_separators_tile, num_col_classifier, tables, multiplier=7.0) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 8431bbe..22a6f50 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -119,14 +119,11 @@ def return_parent_contours(contours, hierarchy): def return_contours_of_interested_region(region_pre_p, label, min_area=0.0002): # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: + if region_pre_p.ndim == 3: cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: cnts_images = (region_pre_p[:, :] == label) * 1 - 
cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) @@ -135,13 +132,11 @@ def return_contours_of_interested_region(region_pre_p, label, min_area=0.0002): return contours_imgs def do_work_of_contours_in_image(contour, index_r_con, img, slope_first): - img_copy = np.zeros(img.shape) - img_copy = cv2.fillPoly(img_copy, pts=[contour], color=(1, 1, 1)) + img_copy = np.zeros(img.shape[:2], dtype=np.uint8) + img_copy = cv2.fillPoly(img_copy, pts=[contour], color=1) img_copy = rotation_image_new(img_copy, -slope_first) - img_copy = img_copy.astype(np.uint8) - imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(img_copy, 0, 255, 0) cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) @@ -164,8 +159,8 @@ def get_textregion_contours_in_org_image(cnts, img, slope_first): cnts_org = [] # print(cnts,'cnts') for i in range(len(cnts)): - img_copy = np.zeros(img.shape) - img_copy = cv2.fillPoly(img_copy, pts=[cnts[i]], color=(1, 1, 1)) + img_copy = np.zeros(img.shape[:2], dtype=np.uint8) + img_copy = cv2.fillPoly(img_copy, pts=[cnts[i]], color=1) # plt.imshow(img_copy) # plt.show() @@ -176,9 +171,7 @@ def get_textregion_contours_in_org_image(cnts, img, slope_first): # plt.imshow(img_copy) # plt.show() - img_copy = img_copy.astype(np.uint8) - imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(img_copy, 0, 255, 0) cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cont_int[0][:, 0, 0] = 
cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) @@ -195,12 +188,11 @@ def get_textregion_contours_in_org_image_light_old(cnts, img, slope_first): interpolation=cv2.INTER_NEAREST) cnts_org = [] for cnt in cnts: - img_copy = np.zeros(img.shape) - img_copy = cv2.fillPoly(img_copy, pts=[(cnt / zoom).astype(int)], color=(1, 1, 1)) + img_copy = np.zeros(img.shape[:2], dtype=np.uint8) + img_copy = cv2.fillPoly(img_copy, pts=[cnt // zoom], color=1) img_copy = rotation_image_new(img_copy, -slope_first).astype(np.uint8) - imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(img_copy, 0, 255, 0) cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) @@ -210,14 +202,13 @@ def get_textregion_contours_in_org_image_light_old(cnts, img, slope_first): return cnts_org def do_back_rotation_and_get_cnt_back(contour_par, index_r_con, img, slope_first, confidence_matrix): - img_copy = np.zeros(img.shape) - img_copy = cv2.fillPoly(img_copy, pts=[contour_par], color=(1, 1, 1)) - confidence_matrix_mapped_with_contour = confidence_matrix * img_copy[:,:,0] - confidence_contour = np.sum(confidence_matrix_mapped_with_contour) / float(np.sum(img_copy[:,:,0])) + img_copy = np.zeros(img.shape[:2], dtype=np.uint8) + img_copy = cv2.fillPoly(img_copy, pts=[contour_par], color=1) + confidence_matrix_mapped_with_contour = confidence_matrix * img_copy + confidence_contour = np.sum(confidence_matrix_mapped_with_contour) / float(np.sum(img_copy)) img_copy = rotation_image_new(img_copy, -slope_first).astype(np.uint8) - imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(img_copy, 0, 255, 0) cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) if len(cont_int)==0: @@ -245,14 +236,11 @@ def 
get_textregion_contours_in_org_image_light(cnts, img, confidence_matrix): def return_contours_of_interested_textline(region_pre_p, label): # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: + if region_pre_p.ndim == 3: cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: cnts_images = (region_pre_p[:, :] == label) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) @@ -262,25 +250,22 @@ def return_contours_of_interested_textline(region_pre_p, label): def return_contours_of_image(image): if len(image.shape) == 2: - image = np.repeat(image[:, :, np.newaxis], 3, axis=2) image = image.astype(np.uint8) + imgray = image else: image = image.astype(np.uint8) - imgray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + imgray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) return contours, hierarchy def return_contours_of_interested_region_by_min_size(region_pre_p, label, min_size=0.00003): # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: + if region_pre_p.ndim == 3: cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: cnts_images = (region_pre_p[:, :] == label) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) contours_imgs, 
hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) @@ -291,24 +276,21 @@ def return_contours_of_interested_region_by_min_size(region_pre_p, label, min_si def return_contours_of_interested_region_by_size(region_pre_p, label, min_area, max_area): # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: + if region_pre_p.ndim == 3: cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: cnts_images = (region_pre_p[:, :] == label) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) contours_imgs = filter_contours_area_of_image_tables( thresh, contours_imgs, hierarchy, max_area=max_area, min_area=min_area) - img_ret = np.zeros((region_pre_p.shape[0], region_pre_p.shape[1], 3)) - img_ret = cv2.fillPoly(img_ret, pts=contours_imgs, color=(1, 1, 1)) + img_ret = np.zeros((region_pre_p.shape[0], region_pre_p.shape[1])) + img_ret = cv2.fillPoly(img_ret, pts=contours_imgs, color=1) - return img_ret[:, :, 0] + return img_ret def dilate_textline_contours(all_found_textline_polygons): return [[polygon2contour(contour2polygon(contour, dilate=6)) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index d41dda1..b8c7f3d 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -142,13 +142,12 @@ def dedup_separate_lines(img_patch, contour_text_interest, thetha, axis): rotation_matrix) def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): - (h, w) = img_patch.shape[:2] + h, w = 
img_patch.shape[:2] center = (w // 2, h // 2) M = cv2.getRotationMatrix2D(center, -thetha, 1.0) x_d = M[0, 2] y_d = M[1, 2] - thetha = thetha / 180. * np.pi - rotation_matrix = np.array([[np.cos(thetha), -np.sin(thetha)], [np.sin(thetha), np.cos(thetha)]]) + rotation_matrix = M[:2, :2] contour_text_interest_copy = contour_text_interest.copy() x_cont = contour_text_interest[:, 0, 0] @@ -1302,19 +1301,16 @@ def separate_lines_new_inside_tiles(img_path, thetha): def separate_lines_vertical_cont(img_patch, contour_text_interest, thetha, box_ind, add_boxes_coor_into_textlines): kernel = np.ones((5, 5), np.uint8) - pixel = 255 + label = 255 min_area = 0 max_area = 1 - if len(img_patch.shape) == 3: - cnts_images = (img_patch[:, :, 0] == pixel) * 1 + if img_patch.ndim == 3: + cnts_images = (img_patch[:, :, 0] == label) * 1 else: - cnts_images = (img_patch[:, :] == pixel) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnts_images = (img_patch[:, :] == label) * 1 + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) + contours_imgs, hierarchy = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) contours_imgs = filter_contours_area_of_image_tables(thresh, @@ -1322,14 +1318,12 @@ def separate_lines_vertical_cont(img_patch, contour_text_interest, thetha, box_i max_area=max_area, min_area=min_area) cont_final = [] for i in range(len(contours_imgs)): - img_contour = np.zeros((cnts_images.shape[0], cnts_images.shape[1], 3)) - img_contour = cv2.fillPoly(img_contour, pts=[contours_imgs[i]], color=(255, 255, 255)) - img_contour = img_contour.astype(np.uint8) + img_contour = np.zeros(cnts_images.shape[:2], 
dtype=np.uint8) + img_contour = cv2.fillPoly(img_contour, pts=[contours_imgs[i]], color=255) img_contour = cv2.dilate(img_contour, kernel, iterations=4) - imgrayrot = cv2.cvtColor(img_contour, cv2.COLOR_BGR2GRAY) - _, threshrot = cv2.threshold(imgrayrot, 0, 255, 0) - contours_text_rot, _ = cv2.findContours(threshrot.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + _, threshrot = cv2.threshold(img_contour, 0, 255, 0) + contours_text_rot, _ = cv2.findContours(threshrot.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) ##contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[ ##0] @@ -1344,8 +1338,7 @@ def separate_lines_vertical_cont(img_patch, contour_text_interest, thetha, box_i def textline_contours_postprocessing(textline_mask, slope, contour_text_interest, box_ind, add_boxes_coor_into_textlines=False): - textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) * 255 - textline_mask = textline_mask.astype(np.uint8) + textline_mask = textline_mask * 255 kernel = np.ones((5, 5), np.uint8) textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_OPEN, kernel) textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_CLOSE, kernel) @@ -1356,12 +1349,11 @@ def textline_contours_postprocessing(textline_mask, slope, y_help = 2 textline_mask_help = np.zeros((textline_mask.shape[0] + int(2 * y_help), - textline_mask.shape[1] + int(2 * x_help), 3)) + textline_mask.shape[1] + int(2 * x_help))) textline_mask_help[y_help : y_help + textline_mask.shape[0], - x_help : x_help + textline_mask.shape[1], :] = np.copy(textline_mask[:, :, :]) + x_help : x_help + textline_mask.shape[1]] = np.copy(textline_mask[:, :]) dst = rotate_image(textline_mask_help, slope) - dst = dst[:, :, 0] dst[dst != 0] = 1 # if np.abs(slope)>.5 and textline_mask.shape[0]/float(textline_mask.shape[1])>3: @@ -1372,21 +1364,18 @@ def textline_contours_postprocessing(textline_mask, slope, contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[0] contour_text_copy[:, 
0, 1] = contour_text_copy[:, 0, 1] - box_ind[1] - img_contour = np.zeros((box_ind[3], box_ind[2], 3)) - img_contour = cv2.fillPoly(img_contour, pts=[contour_text_copy], color=(255, 255, 255)) + img_contour = np.zeros((box_ind[3], box_ind[2])) + img_contour = cv2.fillPoly(img_contour, pts=[contour_text_copy], color=255) img_contour_help = np.zeros((img_contour.shape[0] + int(2 * y_help), - img_contour.shape[1] + int(2 * x_help), 3)) + img_contour.shape[1] + int(2 * x_help))) img_contour_help[y_help : y_help + img_contour.shape[0], - x_help : x_help + img_contour.shape[1], :] = np.copy(img_contour[:, :, :]) + x_help : x_help + img_contour.shape[1]] = np.copy(img_contour[:, :]) img_contour_rot = rotate_image(img_contour_help, slope) - img_contour_rot = img_contour_rot.astype(np.uint8) - # dst_help = dst_help.astype(np.uint8) - imgrayrot = cv2.cvtColor(img_contour_rot, cv2.COLOR_BGR2GRAY) - _, threshrot = cv2.threshold(imgrayrot, 0, 255, 0) - contours_text_rot, _ = cv2.findContours(threshrot.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + _, threshrot = cv2.threshold(img_contour_rot, 0, 255, 0) + contours_text_rot, _ = cv2.findContours(threshrot.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) len_con_text_rot = [len(contours_text_rot[ib]) for ib in range(len(contours_text_rot))] ind_big_con = np.argmax(len_con_text_rot) From 6e57ab3741f5532a30dd2925b423cd40871ab010 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 16:53:59 +0200 Subject: [PATCH 063/101] textline_contours_postprocessing: do not catch arbitrary exceptions --- src/eynollah/utils/separate_lines.py | 68 ++++++++++++++-------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index b8c7f3d..3bfc903 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1344,51 +1344,49 @@ def textline_contours_postprocessing(textline_mask, slope, 
textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_CLOSE, kernel) textline_mask = cv2.erode(textline_mask, kernel, iterations=2) # textline_mask = cv2.erode(textline_mask, kernel, iterations=1) - try: - x_help = 30 - y_help = 2 - textline_mask_help = np.zeros((textline_mask.shape[0] + int(2 * y_help), - textline_mask.shape[1] + int(2 * x_help))) - textline_mask_help[y_help : y_help + textline_mask.shape[0], - x_help : x_help + textline_mask.shape[1]] = np.copy(textline_mask[:, :]) + x_help = 30 + y_help = 2 - dst = rotate_image(textline_mask_help, slope) - dst[dst != 0] = 1 + textline_mask_help = np.zeros((textline_mask.shape[0] + int(2 * y_help), + textline_mask.shape[1] + int(2 * x_help))) + textline_mask_help[y_help : y_help + textline_mask.shape[0], + x_help : x_help + textline_mask.shape[1]] = np.copy(textline_mask[:, :]) - # if np.abs(slope)>.5 and textline_mask.shape[0]/float(textline_mask.shape[1])>3: - # plt.imshow(dst) - # plt.show() + dst = rotate_image(textline_mask_help, slope) + dst[dst != 0] = 1 - contour_text_copy = contour_text_interest.copy() - contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[0] - contour_text_copy[:, 0, 1] = contour_text_copy[:, 0, 1] - box_ind[1] + # if np.abs(slope)>.5 and textline_mask.shape[0]/float(textline_mask.shape[1])>3: + # plt.imshow(dst) + # plt.show() - img_contour = np.zeros((box_ind[3], box_ind[2])) - img_contour = cv2.fillPoly(img_contour, pts=[contour_text_copy], color=255) + contour_text_copy = contour_text_interest.copy() + contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[0] + contour_text_copy[:, 0, 1] = contour_text_copy[:, 0, 1] - box_ind[1] - img_contour_help = np.zeros((img_contour.shape[0] + int(2 * y_help), - img_contour.shape[1] + int(2 * x_help))) - img_contour_help[y_help : y_help + img_contour.shape[0], - x_help : x_help + img_contour.shape[1]] = np.copy(img_contour[:, :]) + img_contour = np.zeros((box_ind[3], box_ind[2])) + img_contour = 
cv2.fillPoly(img_contour, pts=[contour_text_copy], color=255) - img_contour_rot = rotate_image(img_contour_help, slope) + img_contour_help = np.zeros((img_contour.shape[0] + int(2 * y_help), + img_contour.shape[1] + int(2 * x_help))) + img_contour_help[y_help : y_help + img_contour.shape[0], + x_help : x_help + img_contour.shape[1]] = np.copy(img_contour[:, :]) - _, threshrot = cv2.threshold(img_contour_rot, 0, 255, 0) - contours_text_rot, _ = cv2.findContours(threshrot.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + img_contour_rot = rotate_image(img_contour_help, slope) - len_con_text_rot = [len(contours_text_rot[ib]) for ib in range(len(contours_text_rot))] - ind_big_con = np.argmax(len_con_text_rot) + _, threshrot = cv2.threshold(img_contour_rot, 0, 255, 0) + contours_text_rot, _ = cv2.findContours(threshrot.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - if abs(slope) > 45: - _, contours_rotated_clean = separate_lines_vertical_cont( - textline_mask, contours_text_rot[ind_big_con], box_ind, slope, - add_boxes_coor_into_textlines=add_boxes_coor_into_textlines) - else: - _, contours_rotated_clean = separate_lines( - dst, contours_text_rot[ind_big_con], slope, x_help, y_help) - except: - contours_rotated_clean = [] + len_con_text_rot = [len(contours_text_rot[ib]) for ib in range(len(contours_text_rot))] + ind_big_con = np.argmax(len_con_text_rot) + + if abs(slope) > 45: + _, contours_rotated_clean = separate_lines_vertical_cont( + textline_mask, contours_text_rot[ind_big_con], box_ind, slope, + add_boxes_coor_into_textlines=add_boxes_coor_into_textlines) + else: + _, contours_rotated_clean = separate_lines( + dst, contours_text_rot[ind_big_con], slope, x_help, y_help) return contours_rotated_clean From 595ed02743afc3ab8359de5f6feb0ca680546599 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 17:24:50 +0200 Subject: [PATCH 064/101] run_single: simplify; allow running TrOCR in non-fl mode, too - refactor final 
`self.full_layout` conditional, removing copied code - allow running `self.ocr` and `self.tr` branch in both cases (non/fl) - when running TrOCR, use model / processor / device initialised during init (instead of ad-hoc loading) --- src/eynollah/eynollah.py | 277 ++++++++++++++++----------------------- 1 file changed, 112 insertions(+), 165 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 834ecf3..079cf8c 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -379,9 +379,14 @@ class Eynollah: self.model_reading_order = self.our_load_model(self.model_reading_order_dir) if self.ocr and self.tr: self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - #("microsoft/trocr-base-printed")#("microsoft/trocr-base-handwritten") - self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") + if torch.cuda.is_available(): + self.logger.info("Using GPU acceleration") + self.device = torch.device("cuda:0") + else: + self.logger.info("Using CPU processing") + self.device = torch.device("cpu") + #self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") + self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") elif self.ocr and not self.tr: model_ocr = load_model(self.model_ocr_dir , compile=False) @@ -4805,12 +4810,13 @@ class Eynollah: slopes_marginals, mid_point_of_page_width) #print(len(polygons_of_marginals), len(ordered_left_marginals), len(ordered_right_marginals), 'marginals ordred') + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( + contours_only_text_parent_d_ordered, index_by_text_par_con) + else: + contours_only_text_parent_d_ordered = None + if self.full_layout: - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_parent_d_ordered = 
self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d_ordered, index_by_text_par_con) - else: - contours_only_text_parent_d_ordered = None if self.light_version: fun = check_any_text_region_in_model_one_is_main_or_header_light else: @@ -4869,44 +4875,43 @@ class Eynollah: splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) + else: + contours_only_text_parent_h = [] + contours_only_text_parent_h_d_ordered = [] if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) t_order = time.time() - if self.full_layout: - self.logger.info("Step 4/5: Reading Order Detection") - - if self.reading_order_machine_based: - self.logger.info("Using machine-based detection") - if self.right2left: - self.logger.info("Right-to-left mode enabled") - if self.headers_off: - self.logger.info("Headers ignored in reading order") + #if self.full_layout: + self.logger.info("Step 4/5: Reading Order Detection") - if self.reading_order_machine_based: - tror = time.time() - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( - contours_only_text_parent, contours_only_text_parent_h, text_regions_p) + if self.reading_order_machine_based: + self.logger.info("Using machine-based detection") + if self.right2left: + self.logger.info("Right-to-left mode enabled") + if self.headers_off: + self.logger.info("Headers ignored in reading order") + + if self.reading_order_machine_based: + order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( + contours_only_text_parent, contours_only_text_parent_h, text_regions_p) + else: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + order_text_new, id_of_texts_tot = self.do_order_of_regions( + contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = 
self.do_order_of_regions( - contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) - else: - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, - boxes_d, textline_mask_tot_d) - self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") + order_text_new, id_of_texts_tot = self.do_order_of_regions( + contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, + boxes_d, textline_mask_tot_d) + self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") - if self.ocr and not self.tr: - self.logger.info("Step 4.5/5: OCR Processing") - - if torch.cuda.is_available(): - self.logger.info("Using GPU acceleration") - else: - self.logger.info("Using CPU processing") - + if self.ocr: + self.logger.info("Step 4.5/5: OCR Processing") + + if not self.tr: gc.collect() + if len(all_found_textline_polygons)>0: ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons, self.prediction_model, @@ -4941,15 +4946,68 @@ class Eynollah: self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_drop = None + else: - ocr_all_textlines = None - ocr_all_textlines_marginals_left = None - ocr_all_textlines_marginals_right = None - ocr_all_textlines_h = None - ocr_all_textlines_drop = None + if self.light_version: + self.logger.info("Using light version OCR") + if self.textline_light: + self.logger.info("Using light text line detection for OCR") + self.logger.info("Processing text lines...") + + self.device.reset() + gc.collect() + + torch.cuda.empty_cache() + self.model_ocr.to(self.device) + + ind_tot = 0 + #cv2.imwrite('./img_out.png', image_page) + ocr_all_textlines = [] + for indexing, ind_poly_first in enumerate(all_found_textline_polygons): + ocr_textline_in_textregion = [] + for indexing2, ind_poly in enumerate(ind_poly_first): + if not 
(self.textline_light or self.curved_line): + ind_poly = copy.deepcopy(ind_poly) + box_ind = all_box_coord[indexing] + #print(ind_poly,np.shape(ind_poly), 'ind_poly') + #print(box_ind) + ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) + #print(ind_poly_copy) + ind_poly[ind_poly<0] = 0 + x, y, w, h = cv2.boundingRect(ind_poly) + #print(ind_poly_copy, np.shape(ind_poly_copy)) + #print(x, y, w, h, h/float(w),'ratio') + h2w_ratio = h/float(w) + mask_poly = np.zeros(image_page.shape) + if not self.light_version: + img_poly_on_img = np.copy(image_page) + else: + img_poly_on_img = np.copy(img_bin_light) + mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) + + if self.textline_light: + mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1) + img_poly_on_img[:,:,0][mask_poly[:,:,0] ==0] = 255 + img_poly_on_img[:,:,1][mask_poly[:,:,0] ==0] = 255 + img_poly_on_img[:,:,2][mask_poly[:,:,0] ==0] = 255 + + img_croped = img_poly_on_img[y:y+h, x:x+w, :] + #cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) + text_ocr = self.return_ocr_of_textline_without_common_section( + img_croped, self.model_ocr, self.processor, self.device, w, h2w_ratio, ind_tot) + ocr_textline_in_textregion.append(text_ocr) + ind_tot = ind_tot +1 + ocr_all_textlines.append(ocr_textline_in_textregion) + else: + ocr_all_textlines = None + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None + ocr_all_textlines_h = None + ocr_all_textlines_drop = None - self.logger.info("Step 5/5: Output Generation") - + self.logger.info("Step 5/5: Output Generation") + + if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, @@ -4962,129 +5020,18 @@ class Eynollah: ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, 
ocr_all_textlines_drop, conf_contours_textregions, conf_contours_textregions_h) - - return pcgts - - contours_only_text_parent_h = [] - self.logger.info("Step 4/5: Reading Order Detection") - - if self.reading_order_machine_based: - self.logger.info("Using machine-based detection") - if self.right2left: - self.logger.info("Right-to-left mode enabled") - if self.headers_off: - self.logger.info("Headers ignored in reading order") - - if self.reading_order_machine_based: - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( - contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) - else: - contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d_ordered, index_by_text_par_con) - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - - if self.ocr and self.tr: - self.logger.info("Step 4.5/5: OCR Processing") - if torch.cuda.is_available(): - self.logger.info("Using GPU acceleration") - else: - self.logger.info("Using CPU processing") - if self.light_version: - self.logger.info("Using light version OCR") - if self.textline_light: - self.logger.info("Using light text line detection for OCR") - self.logger.info("Processing text lines...") + pcgts = self.writer.build_pagexml_no_full_layout( + txt_con_org, page_coord, order_text_new, id_of_texts_tot, + all_found_textline_polygons, all_box_coord, polygons_of_images, + polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_marginals_left, slopes_marginals_right, + 
cont_page, polygons_seplines, contours_tables, ocr_all_textlines, + ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, + conf_contours_textregions) - device = cuda.get_current_device() - device.reset() - gc.collect() - model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") - torch.cuda.empty_cache() - model_ocr.to(device) - - ind_tot = 0 - #cv2.imwrite('./img_out.png', image_page) - ocr_all_textlines = [] - for indexing, ind_poly_first in enumerate(all_found_textline_polygons): - ocr_textline_in_textregion = [] - for indexing2, ind_poly in enumerate(ind_poly_first): - if not (self.textline_light or self.curved_line): - ind_poly = copy.deepcopy(ind_poly) - box_ind = all_box_coord[indexing] - #print(ind_poly,np.shape(ind_poly), 'ind_poly') - #print(box_ind) - ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) - #print(ind_poly_copy) - ind_poly[ind_poly<0] = 0 - x, y, w, h = cv2.boundingRect(ind_poly) - #print(ind_poly_copy, np.shape(ind_poly_copy)) - #print(x, y, w, h, h/float(w),'ratio') - h2w_ratio = h/float(w) - mask_poly = np.zeros(image_page.shape) - if not self.light_version: - img_poly_on_img = np.copy(image_page) - else: - img_poly_on_img = np.copy(img_bin_light) - mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) - - if self.textline_light: - mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1) - img_poly_on_img[:,:,0][mask_poly[:,:,0] ==0] = 255 - img_poly_on_img[:,:,1][mask_poly[:,:,0] ==0] = 255 - img_poly_on_img[:,:,2][mask_poly[:,:,0] ==0] = 255 - - img_croped = img_poly_on_img[y:y+h, x:x+w, :] - #cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) - text_ocr = self.return_ocr_of_textline_without_common_section( - img_croped, model_ocr, processor, device, w, h2w_ratio, ind_tot) - 
ocr_textline_in_textregion.append(text_ocr) - ind_tot = ind_tot +1 - ocr_all_textlines.append(ocr_textline_in_textregion) - - elif self.ocr and not self.tr: - gc.collect() - if len(all_found_textline_polygons)>0: - ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - - if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: - ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_left, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - - if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: - ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_right, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - - else: - ocr_all_textlines = None - ocr_all_textlines_marginals_left = None - ocr_all_textlines_marginals_right = None - self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") - - self.logger.info("Step 5/5: Output Generation") - self.logger.info("Generating PAGE-XML output") - - pcgts = self.writer.build_pagexml_no_full_layout( - txt_con_org, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, all_box_coord, polygons_of_images, - polygons_of_marginals_left, polygons_of_marginals_right, - all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, - all_box_coord_marginals_left, all_box_coord_marginals_right, - slopes, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, contours_tables, ocr_all_textlines, - ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, - 
conf_contours_textregions) - return pcgts From a1904fa660e7cb79ba9b4d8fc7df5befc41072f1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 17:44:12 +0200 Subject: [PATCH 065/101] tests: cover layout with OCR in various modes --- tests/test_run.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_run.py b/tests/test_run.py index 59e5099..d69f021 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -24,14 +24,18 @@ MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021- "options", [ [], # defaults - ["--allow_scaling", "--curved-line"], + #["--allow_scaling", "--curved-line"], ["--allow_scaling", "--curved-line", "--full-layout"], ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"], ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based", "--textline_light", "--light_version"], # -ep ... # -eoi ... - # --do_ocr + ["--do_ocr"], + ["--do_ocr", "--light_version", "--textline_light"], + ["--do_ocr", "--transformer_ocr"], + #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"], + ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"], # --skip_layout_and_reading_order ], ids=str) def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options): From 23535998f7532942d481f3729682969e19c228b6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 21:27:21 +0200 Subject: [PATCH 066/101] tests: symlink OCR models into layout model directory (so layout with OCR options works with our split model packages) --- Makefile | 19 +++++++++++-------- tests/test_run.py | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 357aa47..5d190b2 100644 --- a/Makefile +++ b/Makefile @@ -90,26 +90,29 @@ deps-test: $(OCR_MODELNAME) endif deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME) $(PIP) install -r requirements-test.txt 
+ifeq (OCR,$(findstring OCR, $(EXTRAS))) + ln -s $(OCR_MODELNAME)/* $(SEG_MODELNAME)/ +endif smoke-test: TMPDIR != mktemp -d smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif # layout analysis: - eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_layout_v0_5_0 + eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/$(SEG_MODELNAME) fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $( Date: Tue, 7 Oct 2025 00:54:25 +0200 Subject: [PATCH 067/101] CI: run deps-test with OCR extra so symlink rule fires --- .github/workflows/test-eynollah.yml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 9d5b2c8..7c3f5ae 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -65,7 +65,7 @@ jobs: run: | python -m pip install --upgrade pip make install-dev EXTRAS=OCR,plotting - make deps-test + make deps-test EXTRAS=OCR,plotting - name: Test with pytest run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml" - name: Get coverage results diff --git a/Makefile b/Makefile index 5d190b2..618b1f9 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ endif deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME) $(PIP) install -r requirements-test.txt ifeq (OCR,$(findstring OCR, $(EXTRAS))) - ln -s $(OCR_MODELNAME)/* $(SEG_MODELNAME)/ + ln -rs $(OCR_MODELNAME)/* $(SEG_MODELNAME)/ endif smoke-test: TMPDIR != mktemp -d From d53f829dfd0b26e4738915b24ffe4256796c6eb4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:06:57 +0200 Subject: [PATCH 068/101] filter_contours_inside_a_bigger_one: fix edge case in 81827c29 --- src/eynollah/eynollah.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 079cf8c..271779f 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4068,7 +4068,9 @@ class 
Eynollah: for textregion_index_to_del in textline_in_textregion_index_to_del: contours[textregion_index_to_del] = list(np.delete( contours[textregion_index_to_del], - textline_in_textregion_index_to_del[textregion_index_to_del])) + textline_in_textregion_index_to_del[textregion_index_to_del], + # needed so numpy does not flatten the entire result when 0 left + axis=0)) return contours From 2e907875c12b4f22c650c109558917479e0ec3ae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:32:06 +0200 Subject: [PATCH 069/101] get_text_region_boxes_by_given_contours: simplify --- src/eynollah/eynollah.py | 4 ++-- src/eynollah/utils/contour.py | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 271779f..06be910 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4726,8 +4726,8 @@ class Eynollah: txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) #print("text region early 4 in %.1fs", time.time() - t0) - boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) - boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) + boxes_text = get_text_region_boxes_by_given_contours(contours_only_text_parent) + boxes_marginals = get_text_region_boxes_by_given_contours(polygons_of_marginals) #print("text region early 5 in %.1fs", time.time() - t0) ## birdan sora chock chakir if not self.curved_line: diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 22a6f50..fb4bbd0 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -36,14 +36,8 @@ def find_contours_mean_y_diff(contours_main): return np.mean(np.diff(np.sort(np.array(cy_main)))) def get_text_region_boxes_by_given_contours(contours): - boxes = [] - contours_new = [] - for jj in range(len(contours)): - box = 
cv2.boundingRect(contours[jj]) - boxes.append(box) - contours_new.append(contours[jj]) - - return boxes, contours_new + return [cv2.boundingRect(contour) + for contour in contours] def filter_contours_area_of_image(image, contours, hierarchy, max_area=1.0, min_area=0.0, dilate=0): found_polygons_early = [] From dfdc70537530b55f77b5232ae3cfa1fc8357eed0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:33:06 +0200 Subject: [PATCH 070/101] do_work_of_slopes: rm unused old variant --- src/eynollah/eynollah.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 06be910..2431a3b 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -108,7 +108,6 @@ from .utils.utils_ocr import ( get_contours_and_bounding_boxes ) from .utils.separate_lines import ( - textline_contours_postprocessing, separate_lines_new2, return_deskew_slop, do_work_of_slopes_new, @@ -2062,43 +2061,6 @@ class Eynollah: (prediction_textline_longshot_true_size[:, :, 0]==1).astype(np.uint8)) - def do_work_of_slopes(self, q, poly, box_sub, boxes_per_process, textline_mask_tot, contours_per_process): - self.logger.debug('enter do_work_of_slopes') - slope_biggest = 0 - slopes_sub = [] - boxes_sub_new = [] - poly_sub = [] - for mv in range(len(boxes_per_process)): - crop_img, _ = crop_image_inside_box(boxes_per_process[mv], textline_mask_tot) - crop_img = cv2.erode(crop_img, KERNEL, iterations=2) - try: - textline_con, hierarchy = return_contours_of_image(crop_img) - textline_con_fil = filter_contours_area_of_image(crop_img, textline_con, hierarchy, - max_area=1, min_area=0.0008) - y_diff_mean = find_contours_mean_y_diff(textline_con_fil) - sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) - crop_img[crop_img > 0] = 1 - slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, - logger=self.logger, plotter=self.plotter) - except Exception as why: - 
self.logger.error(why) - slope_corresponding_textregion = MAX_SLOPE - - if slope_corresponding_textregion == MAX_SLOPE: - slope_corresponding_textregion = slope_biggest - slopes_sub.append(slope_corresponding_textregion) - - cnt_clean_rot = textline_contours_postprocessing( - crop_img, slope_corresponding_textregion, contours_per_process[mv], boxes_per_process[mv]) - - poly_sub.append(cnt_clean_rot) - boxes_sub_new.append(boxes_per_process[mv]) - - q.put(slopes_sub) - poly.put(poly_sub) - box_sub.put(boxes_sub_new) - self.logger.debug('exit do_work_of_slopes') - def get_regions_light_v_extract_only_images(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_extract_images_only") erosion_hurts = False From 0a80cd5dffc7e5c28f41330da8d2f1255ac66e88 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:37:05 +0200 Subject: [PATCH 071/101] avoid unnecessary 3-channel conversions: for tables, too --- src/eynollah/eynollah.py | 155 ++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 90 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 2431a3b..70a8a17 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -930,10 +930,8 @@ class Eynollah: img_w = img.shape[1] prediction_true = np.zeros((img_h, img_w, 3)) mask_true = np.zeros((img_h, img_w)) - nxf = img_w / float(width_mid) - nyf = img_h / float(height_mid) - nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) - nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + nxf = math.ceil(img_w / float(width_mid)) + nyf = math.ceil(img_h / float(height_mid)) list_i_s = [] list_j_s = [] @@ -946,18 +944,10 @@ class Eynollah: img_patch = np.zeros((n_batch_inference, img_height_model, img_width_model, 3)) for i in range(nxf): for j in range(nyf): - if i == 0: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - else: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - if j == 
0: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model - else: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model if index_x_u > img_w: index_x_u = img_w index_x_d = img_w - img_width_model @@ -2600,23 +2590,20 @@ class Eynollah: self, layout, table_prediction_early, pixel_table, num_col_classifier): layout_org = np.copy(layout) - layout_org[:,:,0][layout_org[:,:,0]==pixel_table] = 0 - layout = (layout[:,:,0]==pixel_table)*1 - - layout = layout.astype(np.uint8) + layout_org[layout_org == pixel_table] = 0 + layout = (layout == pixel_table).astype(np.uint8) * 1 _, thresh = cv2.threshold(layout, 0, 255, 0) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cnt_size = np.array([cv2.contourArea(contours[j]) - for j in range(len(contours))]) + cnt_size = np.array([cv2.contourArea(cnt) for cnt in contours]) contours_new = [] - for i in range(len(contours)): - x, y, w, h = cv2.boundingRect(contours[i]) + for i, contour in enumerate(contours): + x, y, w, h = cv2.boundingRect(contour) iou = cnt_size[i] /float(w*h) *100 if iou<80: layout_contour = np.zeros(layout_org.shape[:2]) - layout_contour = cv2.fillPoly(layout_contour, pts=[contours[i]] ,color=1) + layout_contour = cv2.fillPoly(layout_contour, pts=[contour] ,color=1) layout_contour_sum = layout_contour.sum(axis=0) layout_contour_sum_diff = np.diff(layout_contour_sum) @@ -2648,26 +2635,26 @@ class Eynollah: #print(iou_in,'iou_in_in1') if iou_in>30: - layout_org= cv2.fillPoly(layout_org, pts=[contours_sep[ji]], color=3 * (pixel_table,)) + layout_org = cv2.fillPoly(layout_org, pts=[contours_sep[ji]], color=pixel_table) else: pass else: - layout_org= cv2.fillPoly(layout_org, pts=[contours_sep[ji]], color=3 * (pixel_table,)) + layout_org= cv2.fillPoly(layout_org, pts=[contours_sep[ji]], color=pixel_table) else: - 
contours_new.append(contours[i]) + contours_new.append(contour) if num_col_classifier>=2: - only_recent_contour_image = np.zeros((layout.shape[0],layout.shape[1])) - only_recent_contour_image= cv2.fillPoly(only_recent_contour_image,pts=[contours[i]] ,color=(1,1,1)) + only_recent_contour_image = np.zeros(layout.shape[:2]) + only_recent_contour_image = cv2.fillPoly(only_recent_contour_image, pts=[contour],color=1) table_pixels_masked_from_early_pre = only_recent_contour_image * table_prediction_early iou_in = 100. * table_pixels_masked_from_early_pre.sum() / only_recent_contour_image.sum() #print(iou_in,'iou_in') if iou_in>30: - layout_org= cv2.fillPoly(layout_org, pts=[contours[i]], color=3 * (pixel_table,)) + layout_org = cv2.fillPoly(layout_org, pts=[contour], color=pixel_table) else: pass else: - layout_org= cv2.fillPoly(layout_org, pts=[contours[i]], color=3 * (pixel_table,)) + layout_org = cv2.fillPoly(layout_org, pts=[contour], color=pixel_table) return layout_org, contours_new @@ -2714,16 +2701,10 @@ class Eynollah: pass boxes = np.array(boxes, dtype=int) # to be on the safe side - img_comm_e = np.zeros(image_revised_1.shape) - img_comm = np.repeat(img_comm_e[:, :, np.newaxis], 3, axis=2) - + img_comm = np.zeros(image_revised_1.shape, dtype=np.uint8) for indiv in np.unique(image_revised_1): - image_col=(image_revised_1==indiv)*255 - img_comm_in=np.repeat(image_col[:, :, np.newaxis], 3, axis=2) - img_comm_in=img_comm_in.astype(np.uint8) - - imgray = cv2.cvtColor(img_comm_in, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + image_col = (image_revised_1 == indiv).astype(np.uint8) * 255 + _, thresh = cv2.threshold(image_col, 0, 255, 0) contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) if indiv==pixel_table: @@ -2733,35 +2714,27 @@ class Eynollah: main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area=1, min_area=min_area) - img_comm = cv2.fillPoly(img_comm, pts = 
main_contours, color = (indiv, indiv, indiv)) - img_comm = img_comm.astype(np.uint8) + img_comm = cv2.fillPoly(img_comm, pts=main_contours, color=indiv) if not self.isNaN(slope_mean_hor): - image_revised_last = np.zeros((image_regions_eraly_p.shape[0], image_regions_eraly_p.shape[1],3)) + image_revised_last = np.zeros(image_regions_eraly_p.shape[:2]) for i in range(len(boxes)): box_ys = slice(*boxes[i][2:4]) box_xs = slice(*boxes[i][0:2]) image_box = img_comm[box_ys, box_xs] try: - image_box_tabels_1=(image_box[:,:,0]==pixel_table)*1 + image_box_tabels_1 = (image_box == pixel_table) * 1 contours_tab,_=return_contours_of_image(image_box_tabels_1) contours_tab=filter_contours_area_of_image_tables(image_box_tabels_1,contours_tab,_,1,0.003) - image_box_tabels_1=(image_box[:,:,0]==pixel_line)*1 + image_box_tabels_1 = (image_box == pixel_line).astype(np.uint8) * 1 + image_box_tabels_and_m_text = ( (image_box == pixel_table) | + (image_box == 1) ).astype(np.uint8) * 1 - image_box_tabels_and_m_text=( (image_box[:,:,0]==pixel_table) | (image_box[:,:,0]==1) )*1 - image_box_tabels_and_m_text=image_box_tabels_and_m_text.astype(np.uint8) + image_box_tabels_1 = cv2.dilate(image_box_tabels_1, KERNEL, iterations=5) - image_box_tabels_1=image_box_tabels_1.astype(np.uint8) - image_box_tabels_1 = cv2.dilate(image_box_tabels_1,KERNEL,iterations = 5) - - contours_table_m_text,_=return_contours_of_image(image_box_tabels_and_m_text) - image_box_tabels=np.repeat(image_box_tabels_1[:, :, np.newaxis], 3, axis=2) - - image_box_tabels=image_box_tabels.astype(np.uint8) - imgray = cv2.cvtColor(image_box_tabels, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_line,hierachy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + contours_table_m_text, _ = return_contours_of_image(image_box_tabels_and_m_text) + _, thresh = cv2.threshold(image_box_tabels_1, 0, 255, 0) + contours_line, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 
y_min_main_line ,y_max_main_line=find_features_of_contours(contours_line) y_min_main_tab ,y_max_main_tab=find_features_of_contours(contours_tab) @@ -2793,18 +2766,20 @@ class Eynollah: y_max_main_tab[i_t] < y_min_main_line[i_l] and y_min_main_tab[i_t] < y_min_main_line[i_l]): pass - elif np.abs(y_max_main_line[i_l]-y_min_main_line[i_l])<100: + elif abs(y_max_main_line[i_l] - y_min_main_line[i_l]) < 100: pass else: - y_up_tab.append(np.min([y_min_main_line[i_l], y_min_main_tab[i_t] ]) ) - y_down_tab.append( np.max([ y_max_main_line[i_l],y_max_main_tab[i_t] ]) ) + y_up_tab.append(min([y_min_main_line[i_l], + y_min_main_tab[i_t]])) + y_down_tab.append(max([y_max_main_line[i_l], + y_max_main_tab[i_t]])) if len(y_up_tab)==0: y_up_tabs.append(y_min_main_tab[i_t]) y_down_tabs.append(y_max_main_tab[i_t]) else: - y_up_tabs.append(np.min(y_up_tab)) - y_down_tabs.append(np.max(y_down_tab)) + y_up_tabs.append(min(y_up_tab)) + y_down_tabs.append(max(y_down_tab)) else: y_down_tabs=[] y_up_tabs=[] @@ -2814,7 +2789,7 @@ class Eynollah: y_up_tabs=[] for ii in range(len(y_up_tabs)): - image_box[y_up_tabs[ii]:y_down_tabs[ii],:,0]=pixel_table + image_box[y_up_tabs[ii]:y_down_tabs[ii]] = pixel_table image_revised_last[box_ys, box_xs] = image_box else: @@ -2825,14 +2800,14 @@ class Eynollah: image_revised_last[box_ys, box_xs] = image_box if num_col_classifier==1: - img_tables_col_1 = (image_revised_last[:,:,0] == pixel_table).astype(np.uint8) + img_tables_col_1 = (image_revised_last == pixel_table).astype(np.uint8) contours_table_col1, _ = return_contours_of_image(img_tables_col_1) _,_ ,_ , _, y_min_tab_col1 ,y_max_tab_col1, _= find_new_features_of_contours(contours_table_col1) if len(y_min_tab_col1)>0: for ijv in range(len(y_min_tab_col1)): - image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv]),:,:]=pixel_table + image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = pixel_table return image_revised_last def get_tables_from_model(self, img, 
num_col_classifier): @@ -3200,7 +3175,7 @@ class Eynollah: pass else: text_regions_p_tables = np.copy(text_regions_p) - text_regions_p_tables[:,:][(table_prediction[:,:] == 1)] = 10 + text_regions_p_tables[(table_prediction == 1)] = 10 pixel_line = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, @@ -3221,8 +3196,8 @@ class Eynollah: pass else: text_regions_p_tables = np.copy(text_regions_p_1_n) - text_regions_p_tables =np.round(text_regions_p_tables) - text_regions_p_tables[:,:][(text_regions_p_tables[:,:] != 3) & (table_prediction_n[:,:] == 1)] = 10 + text_regions_p_tables = np.round(text_regions_p_tables) + text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 pixel_line = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( @@ -3242,21 +3217,21 @@ class Eynollah: if self.tables: if self.light_version: - text_regions_p[:,:][table_prediction[:,:]==1] = 10 - img_revised_tab=text_regions_p[:,:] + text_regions_p[table_prediction == 1] = 10 + img_revised_tab = text_regions_p[:,:] else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - img_revised_tab = np.copy(img_revised_tab2[:,:,0]) - img_revised_tab[:,:][(text_regions_p[:,:] == 1) & (img_revised_tab[:,:] != 10)] = 1 + img_revised_tab = np.copy(img_revised_tab2) + img_revised_tab[(text_regions_p == 1) & (img_revised_tab != 10)] = 1 else: - img_revised_tab = np.copy(text_regions_p[:,:]) - img_revised_tab[:,:][img_revised_tab[:,:] == 10] = 0 - img_revised_tab[:,:][img_revised_tab2_d_rotated[:,:,0] == 10] = 10 + img_revised_tab = np.copy(text_regions_p) + img_revised_tab[img_revised_tab == 10] = 0 + img_revised_tab[img_revised_tab2_d_rotated == 10] = 10 - text_regions_p[:,:][text_regions_p[:,:]==10] = 0 - text_regions_p[:,:][img_revised_tab[:,:]==10] = 10 + text_regions_p[text_regions_p == 10] = 0 + text_regions_p[img_revised_tab == 10] = 10 else: - img_revised_tab=text_regions_p[:,:] + 
img_revised_tab = text_regions_p[:,:] #img_revised_tab = text_regions_p[:, :] if self.light_version: polygons_of_images = return_contours_of_interested_region(text_regions_p, 2) @@ -3386,7 +3361,7 @@ class Eynollah: num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p_1_n) text_regions_p_tables = np.round(text_regions_p_tables) - text_regions_p_tables[:,:][(text_regions_p_tables[:,:]!=3) & (table_prediction_n[:,:]==1)] = 10 + text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 pixel_line = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( @@ -3405,17 +3380,17 @@ class Eynollah: text_regions_p.shape[1]) if np.abs(slope_deskew) < 0.13: - img_revised_tab = np.copy(img_revised_tab2[:,:,0]) + img_revised_tab = np.copy(img_revised_tab2) else: - img_revised_tab = np.copy(text_regions_p[:,:]) - img_revised_tab[:,:][img_revised_tab[:,:] == 10] = 0 - img_revised_tab[:,:][img_revised_tab2_d_rotated[:,:,0] == 10] = 10 + img_revised_tab = np.copy(text_regions_p) + img_revised_tab[img_revised_tab == 10] = 0 + img_revised_tab[img_revised_tab2_d_rotated == 10] = 10 - ##img_revised_tab=img_revised_tab2[:,:,0] - #img_revised_tab=text_regions_p[:,:] - text_regions_p[:,:][text_regions_p[:,:]==10] = 0 - text_regions_p[:,:][img_revised_tab[:,:]==10] = 10 - #img_revised_tab[img_revised_tab2[:,:,0]==10] =10 + ##img_revised_tab = img_revised_tab2[:,:] + #img_revised_tab = text_regions_p[:,:] + text_regions_p[text_regions_p == 10] = 0 + text_regions_p[img_revised_tab == 10] = 10 + #img_revised_tab[img_revised_tab2 == 10] = 10 pixel_img = 4 min_area_mar = 0.00001 From fd43e78442251c552faafeffe02256023ae1a806 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:42:36 +0200 Subject: [PATCH 072/101] filter_contours_without_textline_inside: simplify - np.delete in index array instead of contour lists - yield actual resulting indices --- src/eynollah/eynollah.py | 77 
++++------------------------------------ 1 file changed, 6 insertions(+), 71 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 70a8a17..6cc8b1b 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4040,79 +4040,23 @@ class Eynollah: self, contours, text_con_org, contours_textline, contours_only_text_parent_d_ordered, conf_contours_textregions): - ###contours_txtline_of_all_textregions = [] - ###for jj in range(len(contours_textline)): - ###contours_txtline_of_all_textregions = contours_txtline_of_all_textregions + contours_textline[jj] - ###M_main_textline = [cv2.moments(contours_txtline_of_all_textregions[j]) - ### for j in range(len(contours_txtline_of_all_textregions))] - ###cx_main_textline = [(M_main_textline[j]["m10"] / (M_main_textline[j]["m00"] + 1e-32)) - ### for j in range(len(M_main_textline))] - ###cy_main_textline = [(M_main_textline[j]["m01"] / (M_main_textline[j]["m00"] + 1e-32)) - ### for j in range(len(M_main_textline))] - - ###M_main = [cv2.moments(contours[j]) for j in range(len(contours))] - ###cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - ###cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - - ###contours_with_textline = [] - ###for ind_tr, con_tr in enumerate(contours): - ###results = [cv2.pointPolygonTest(con_tr, - ### (cx_main_textline[index_textline_con], - ### cy_main_textline[index_textline_con]), - ### False) - ### for index_textline_con in range(len(contours_txtline_of_all_textregions)) ] - ###results = np.array(results) - ###if np.any(results==1): - ###contours_with_textline.append(con_tr) - - textregion_index_to_del = set() - for index_textregion, textlines_textregion in enumerate(contours_textline): - if len(textlines_textregion) == 0: - textregion_index_to_del.add(index_textregion) + assert len(contours_par) == len(contours_textline) + indices = np.arange(len(contours_textline)) + indices = 
np.delete(indices, np.flatnonzero([len(lines) == 0 for lines in contours_textline])) def filterfun(lis): if len(lis) == 0: return [] - if len(textregion_index_to_del) == 0: - return lis - return list(np.delete(lis, list(textregion_index_to_del))) + return list(np.array(lis)[indices]) return (filterfun(contours), filterfun(text_con_org), filterfun(conf_contours_textregions), filterfun(contours_textline), filterfun(contours_only_text_parent_d_ordered), - np.arange(len(contours) - len(textregion_index_to_del))) + indices + ) - def delete_regions_without_textlines( - self, slopes, all_found_textline_polygons, boxes_text, txt_con_org, - contours_only_text_parent, index_by_text_par_con): - - slopes_rem = [] - all_found_textline_polygons_rem = [] - boxes_text_rem = [] - txt_con_org_rem = [] - contours_only_text_parent_rem = [] - index_by_text_par_con_rem = [] - - for i, ind_con in enumerate(all_found_textline_polygons): - if len(ind_con): - all_found_textline_polygons_rem.append(ind_con) - slopes_rem.append(slopes[i]) - boxes_text_rem.append(boxes_text[i]) - txt_con_org_rem.append(txt_con_org[i]) - contours_only_text_parent_rem.append(contours_only_text_parent[i]) - index_by_text_par_con_rem.append(index_by_text_par_con[i]) - - index_sort = np.argsort(index_by_text_par_con_rem) - indexes_new = np.array(range(len(index_by_text_par_con_rem))) - - index_by_text_par_con_rem_sort = [indexes_new[index_sort==j][0] - for j in range(len(index_by_text_par_con_rem))] - - return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, - contours_only_text_parent_rem, index_by_text_par_con_rem_sort) - def separate_marginals_to_left_and_right_and_order_from_top_to_down( self, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width): @@ -4679,15 +4623,6 @@ class Eynollah: polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, boxes_marginals, slope_deskew) - #slopes, 
all_found_textline_polygons, boxes_text, txt_con_org, \ - # contours_only_text_parent, index_by_text_par_con = \ - # self.delete_regions_without_textlines(slopes, all_found_textline_polygons, - # boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con) - #slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, \ - # polygons_of_marginals, polygons_of_marginals, _ = \ - # self.delete_regions_without_textlines(slopes_marginals, all_found_textline_polygons_marginals, - # boxes_marginals, polygons_of_marginals, polygons_of_marginals, - # np.array(range(len(polygons_of_marginals)))) all_found_textline_polygons = dilate_textline_contours( all_found_textline_polygons) all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( From 02a347a48a972de49c4b098f454a9a16cc4ee4fc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:47:34 +0200 Subject: [PATCH 073/101] no more need to rm from `contours_only_text_parent_d_ordered` now --- src/eynollah/eynollah.py | 16 ++-------------- src/eynollah/utils/__init__.py | 8 ++++---- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6cc8b1b..c4a6600 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4437,6 +4437,8 @@ class Eynollah: ###min_con_area = 0.000005 contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + contours_only_text_parent_d_ordered = [] + contours_only_text_parent_d = [] if len(contours_only_text_parent) > 0: areas_tot_text = np.prod(text_only.shape) areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) @@ -4558,15 +4560,6 @@ class Eynollah: # plt.subplot(2, 2, 2, title="result contours") # plt.imshow(img4) # plt.show() - else: - contours_only_text_parent_d_ordered = [] - contours_only_text_parent_d = [] - contours_only_text_parent = 
[] - - else: - contours_only_text_parent_d_ordered = [] - contours_only_text_parent_d = [] - #contours_only_text_parent = [] if not len(contours_only_text_parent): # stop early @@ -4684,11 +4677,6 @@ class Eynollah: slopes_marginals, mid_point_of_page_width) #print(len(polygons_of_marginals), len(ordered_left_marginals), len(ordered_right_marginals), 'marginals ordred') - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d_ordered, index_by_text_par_con) - else: - contours_only_text_parent_d_ordered = None if self.full_layout: if self.light_version: diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index ebf78fe..5ccb2af 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -938,7 +938,7 @@ def check_any_text_region_in_model_one_is_main_or_header( if (pixels_header>=pixels_main) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ]=2 contours_only_text_parent_head.append(con) - if contours_only_text_parent_d_ordered is not None: + if len(contours_only_text_parent_d_ordered): contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_head.append(all_box_coord[ii]) slopes_head.append(slopes[ii]) @@ -948,7 +948,7 @@ def check_any_text_region_in_model_one_is_main_or_header( regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ]=1 contours_only_text_parent_main.append(con) conf_contours_main.append(conf_contours[ii]) - if contours_only_text_parent_d_ordered is not None: + if len(contours_only_text_parent_d_ordered): contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_main.append(all_box_coord[ii]) slopes_main.append(slopes[ii]) @@ -1033,7 +1033,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( 
regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ] = 2 contours_only_text_parent_head.append(contours_only_text_parent[ii]) conf_contours_head.append(None) # why not conf_contours[ii], too? - if contours_only_text_parent_d_ordered is not None: + if len(contours_only_text_parent_d_ordered): contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_head.append(all_box_coord[ii]) slopes_head.append(slopes[ii]) @@ -1043,7 +1043,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ] = 1 contours_only_text_parent_main.append(contours_only_text_parent[ii]) conf_contours_main.append(conf_contours[ii]) - if contours_only_text_parent_d_ordered is not None: + if len(contours_only_text_parent_d_ordered): contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_main.append(all_box_coord[ii]) slopes_main.append(slopes[ii]) From d88ca18eec8f1a4def371848c218b817fdb728a1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:53:30 +0200 Subject: [PATCH 074/101] get/do_work_of_slopes etc.: reduce call/return signatures - `get_textregion_contours_in_org_image_light`: no more need to also return unchanged contours here (see 41cc38c5); therefore - `txt_con_org`: no more need for this (now mere alias to `contours_only_text_parent`); also - `index_by_text_par_con`: no more need for this (see prev. 
commit), so do not pass/return - `get_slopes_and_deskew_*`: do not pass `contours_only_text` (where not used) - `get_slopes_and_deskew_*`: do not return unchanged contours, boxes - `do_work_of_slopes_*`: adapt respectively --- src/eynollah/eynollah.py | 98 +++++++++++++--------------- src/eynollah/utils/contour.py | 4 +- src/eynollah/utils/separate_lines.py | 12 ++-- 3 files changed, 54 insertions(+), 60 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index c4a6600..ec68bcd 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -879,7 +879,7 @@ class Eynollah: thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): - self.logger.debug("enter do_prediction") + self.logger.debug("enter do_prediction (patches=%d)", patches) img_height_model = model.layers[-1].output_shape[1] img_width_model = model.layers[-1].output_shape[2] @@ -1856,7 +1856,7 @@ class Eynollah: return sorted_textlines - def get_slopes_and_deskew_new_light2(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): + def get_slopes_and_deskew_new_light2(self, contours_par, textline_mask_tot, boxes, slope_deskew): polygons_of_textlines = return_contours_of_interested_region(textline_mask_tot,1,0.00001) cx_main_tot, cy_main_tot = find_center_of_contours(polygons_of_textlines) @@ -1889,16 +1889,12 @@ class Eynollah: all_box_coord.append(crop_coor) return (all_found_textline_polygons, - boxes, - contours, - contours_par, all_box_coord, - np.array(range(len(contours_par))), slopes) def get_slopes_and_deskew_new_light(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): if not len(contours): - return [], [], [], [], [], [], [] + return [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_light") with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: results = self.executor.map(partial(do_work_of_slopes_new_light, @@ -1906,15 +1902,15 @@ class Eynollah: slope_deskew=slope_deskew, 
textline_light=self.textline_light, logger=self.logger,), - boxes, contours, contours_par, range(len(contours_par))) + boxes, contours, contours_par) results = list(results) # exhaust prior to release - #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) + #textline_polygons, box_coord, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new_light") return tuple(zip(*results)) def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): if not len(contours): - return [], [], [], [], [], [], [] + return [], [], [] self.logger.debug("enter get_slopes_and_deskew_new") with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: results = self.executor.map(partial(do_work_of_slopes_new, @@ -1924,16 +1920,16 @@ class Eynollah: KERNEL=KERNEL, logger=self.logger, plotter=self.plotter,), - boxes, contours, contours_par, range(len(contours_par))) + boxes, contours, contours_par) results = list(results) # exhaust prior to release - #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) + #textline_polygons, box_coord, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new") return tuple(zip(*results)) - def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, boxes, + def get_slopes_and_deskew_new_curved(self, contours_par, textline_mask_tot, boxes, mask_texts_only, num_col, scale_par, slope_deskew): - if not len(contours): - return [], [], [], [], [], [], [] + if not len(contours_par): + return [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_curved") with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: with share_ndarray(mask_texts_only) as mask_texts_only_shared: @@ -1947,9 +1943,9 @@ class Eynollah: KERNEL=KERNEL, logger=self.logger, plotter=self.plotter,), - boxes, contours, contours_par, range(len(contours_par))) + boxes, 
contours_par) results = list(results) # exhaust prior to release - #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) + #textline_polygons, box_coord, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new_curved") return tuple(zip(*results)) @@ -4037,7 +4033,7 @@ class Eynollah: def filter_contours_without_textline_inside( - self, contours, text_con_org, contours_textline, + self, contours_par, contours_textline, contours_only_text_parent_d_ordered, conf_contours_textregions): @@ -4049,12 +4045,11 @@ class Eynollah: return [] return list(np.array(lis)[indices]) - return (filterfun(contours), - filterfun(text_con_org), - filterfun(conf_contours_textregions), + return (filterfun(contours_par), filterfun(contours_textline), filterfun(contours_only_text_parent_d_ordered), - indices + filterfun(conf_contours_textregions), + # indices ) def separate_marginals_to_left_and_right_and_order_from_top_to_down( @@ -4592,12 +4587,11 @@ class Eynollah: contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, marginal_cnts=polygons_of_marginals) #print("text region early 3.5 in %.1fs", time.time() - t0) - txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( + conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) - #txt_con_org = dilate_textregion_contours(txt_con_org) #contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) else: - txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( + conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) #print("text region early 4 in %.1fs", time.time() - t0) boxes_text = get_text_region_boxes_by_given_contours(contours_only_text_parent) @@ -4607,13 +4601,13 @@ class Eynollah: if not self.curved_line: if 
self.light_version: if self.textline_light: - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light2( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, + all_found_textline_polygons, \ + all_box_coord, slopes = self.get_slopes_and_deskew_new_light2( + contours_only_text_parent, textline_mask_tot_ea_org, boxes_text, slope_deskew) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light2( - polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, + all_found_textline_polygons_marginals, \ + all_box_coord_marginals, slopes_marginals = self.get_slopes_and_deskew_new_light2( + polygons_of_marginals, textline_mask_tot_ea_org, boxes_marginals, slope_deskew) all_found_textline_polygons = dilate_textline_contours( @@ -4622,46 +4616,46 @@ class Eynollah: all_found_textline_polygons, None, textline_mask_tot_ea_org, type_contour="textline") all_found_textline_polygons_marginals = dilate_textline_contours( all_found_textline_polygons_marginals) - contours_only_text_parent, txt_con_org, conf_contours_textregions, \ - all_found_textline_polygons, contours_only_text_parent_d_ordered, \ - index_by_text_par_con = self.filter_contours_without_textline_inside( - contours_only_text_parent, txt_con_org, all_found_textline_polygons, + contours_only_text_parent, all_found_textline_polygons, \ + contours_only_text_parent_d_ordered, conf_contours_textregions = \ + self.filter_contours_without_textline_inside( + contours_only_text_parent, all_found_textline_polygons, contours_only_text_parent_d_ordered, conf_contours_textregions) else: textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, \ - 
index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea, + all_found_textline_polygons, \ + all_box_coord, slopes = self.get_slopes_and_deskew_new_light( + contours_only_text_parent, contours_only_text_parent, textline_mask_tot_ea, boxes_text, slope_deskew) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light( + all_found_textline_polygons_marginals, \ + all_box_coord_marginals, slopes_marginals = self.get_slopes_and_deskew_new_light( polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, boxes_marginals, slope_deskew) #all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( # all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline") else: textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea, + all_found_textline_polygons, \ + all_box_coord, slopes = self.get_slopes_and_deskew_new( + contours_only_text_parent, contours_only_text_parent, textline_mask_tot_ea, boxes_text, slope_deskew) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new( + all_found_textline_polygons_marginals, \ + all_box_coord_marginals, slopes_marginals = self.get_slopes_and_deskew_new( polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, boxes_marginals, slope_deskew) else: scale_param = 1 textline_mask_tot_ea_erode = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=2) - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - 
all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea_erode, + all_found_textline_polygons, \ + all_box_coord, slopes = self.get_slopes_and_deskew_new_curved( + contours_only_text_parent, textline_mask_tot_ea_erode, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons = small_textlines_to_parent_adherence2( all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved( - polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_erode, + all_found_textline_polygons_marginals, \ + all_box_coord_marginals, slopes_marginals = self.get_slopes_and_deskew_new_curved( + polygons_of_marginals, textline_mask_tot_ea_erode, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2( @@ -4884,7 +4878,7 @@ class Eynollah: conf_contours_textregions, conf_contours_textregions_h) else: pcgts = self.writer.build_pagexml_no_full_layout( - txt_con_org, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index fb4bbd0..2560846 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -216,7 +216,7 @@ def do_back_rotation_and_get_cnt_back(contour_par, index_r_con, img, slope_first def get_textregion_contours_in_org_image_light(cnts, img, confidence_matrix): if not len(cnts): - return [], [] + return [] 
confidence_matrix = cv2.resize(confidence_matrix, (img.shape[1] // 6, img.shape[0] // 6), @@ -226,7 +226,7 @@ def get_textregion_contours_in_org_image_light(cnts, img, confidence_matrix): cnt_mask = np.zeros(confidence_matrix.shape) cnt_mask = cv2.fillPoly(cnt_mask, pts=[cnt // 6], color=1.0) confs.append(np.sum(confidence_matrix * cnt_mask) / np.sum(cnt_mask)) - return cnts, confs + return confs def return_contours_of_interested_textline(region_pre_p, label): # pixels of images are identified by 5 diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 3bfc903..22ef00d 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1592,7 +1592,7 @@ def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map @wrap_ndarray_shared(kw='textline_mask_tot_ea') def do_work_of_slopes_new( - box_text, contour, contour_par, index_r_con, + box_text, contour, contour_par, textline_mask_tot_ea=None, slope_deskew=0.0, logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None ): @@ -1647,12 +1647,12 @@ def do_work_of_slopes_new( all_text_region_raw[mask_only_con_region == 0] = 0 cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contour_par, box_text) - return cnt_clean_rot, box_text, contour, contour_par, crop_coor, index_r_con, slope + return cnt_clean_rot, crop_coor, slope @wrap_ndarray_shared(kw='textline_mask_tot_ea') @wrap_ndarray_shared(kw='mask_texts_only') def do_work_of_slopes_new_curved( - box_text, contour, contour_par, index_r_con, + box_text, contour_par, textline_mask_tot_ea=None, mask_texts_only=None, num_col=1, scale_par=1.0, slope_deskew=0.0, logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None @@ -1743,11 +1743,11 @@ def do_work_of_slopes_new_curved( slope_for_all, contour_par, box_text, True) - return textlines_cnt_per_region[::-1], box_text, contour, contour_par, crop_coor, index_r_con, slope + return textlines_cnt_per_region[::-1], 
crop_coor, slope @wrap_ndarray_shared(kw='textline_mask_tot_ea') def do_work_of_slopes_new_light( - box_text, contour, contour_par, index_r_con, + box_text, contour, contour_par, textline_mask_tot_ea=None, slope_deskew=0, textline_light=True, logger=None ): @@ -1777,4 +1777,4 @@ def do_work_of_slopes_new_light( all_text_region_raw[mask_only_con_region == 0] = 0 cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_deskew, contour_par, box_text) - return cnt_clean_rot, box_text, contour, contour_par, crop_coor, index_r_con, slope_deskew + return cnt_clean_rot, crop_coor, slope_deskew From e32479765cc52a29462b36f876d253478860f176 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 23:03:27 +0200 Subject: [PATCH 075/101] writer: simplify - simplify serialization of coordinates - re-use `serialize_lines_in_region` (drop `*_in_dropcapital` and `*_in_marginal`) - re-use `calculate_polygon_coords` --- src/eynollah/writer.py | 343 ++++++++++++++++------------------------- 1 file changed, 131 insertions(+), 212 deletions(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 936c95f..67a2989 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -56,113 +56,30 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_all_textlines_textregion): - for j in range(len(all_found_textline_polygons_marginals[marginal_idx])): - coords = CoordsType() - textline = TextLineType(id=counter.next_line_id, Coords=coords) - if ocr_all_textlines_textregion: - textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) - marginal_region.add_TextLine(textline) - marginal_region.set_orientation(-slopes_marginals[marginal_idx]) - points_co = '' - for l in 
range(len(all_found_textline_polygons_marginals[marginal_idx][j])): - if not (self.curved_line or self.textline_light): - if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: - textline_x_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) - textline_y_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) - else: - textline_x_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) - textline_y_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) - points_co += str(textline_x_coord) - points_co += ',' - points_co += str(textline_y_coord) - if (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) <= 45: - if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) - - elif (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) > 45: - if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + 
page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) - points_co += ' ' - coords.set_points(points_co[:-1]) - def serialize_lines_in_region(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion): self.logger.debug('enter serialize_lines_in_region') - for j in range(len(all_found_textline_polygons[region_idx])): + for j, polygon_textline in enumerate(all_found_textline_polygons[region_idx]): coords = CoordsType() textline = TextLineType(id=counter.next_line_id, Coords=coords) if ocr_all_textlines_textregion: - textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) + # FIXME: add OCR confidence + textline.set_TextEquiv([TextEquivType(Unicode=ocr_all_textlines_textregion[j])]) text_region.add_TextLine(textline) text_region.set_orientation(-slopes[region_idx]) region_bboxes = all_box_coord[region_idx] points_co = '' - for idx_contour_textline, contour_textline in enumerate(all_found_textline_polygons[region_idx][j]): - if not (self.curved_line or self.textline_light): - if len(contour_textline) == 2: - textline_x_coord = max(0, int((contour_textline[0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((contour_textline[1] + region_bboxes[0] + page_coord[0]) / self.scale_y)) - else: - textline_x_coord = max(0, int((contour_textline[0][0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, 
int((contour_textline[0][1] + region_bboxes[0] + page_coord[0]) / self.scale_y)) - points_co += str(textline_x_coord) - points_co += ',' - points_co += str(textline_y_coord) - - if self.textline_light or (self.curved_line and np.abs(slopes[region_idx]) <= 45): - if len(contour_textline) == 2: - points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) - elif self.curved_line and np.abs(slopes[region_idx]) > 45: - if len(contour_textline)==2: - points_co += str(int((contour_textline[0] + region_bboxes[2] + page_coord[2])/self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[1] + region_bboxes[0] + page_coord[0])/self.scale_y)) - else: - points_co += str(int((contour_textline[0][0] + region_bboxes[2]+page_coord[2])/self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y)) - points_co += ' ' - coords.set_points(points_co[:-1]) - - def serialize_lines_in_dropcapital(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion): - self.logger.debug('enter serialize_lines_in_region') - for j in range(1): - coords = CoordsType() - textline = TextLineType(id=counter.next_line_id, Coords=coords) - if ocr_all_textlines_textregion: - textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) - text_region.add_TextLine(textline) - #region_bboxes = all_box_coord[region_idx] - points_co = '' - for idx_contour_textline, contour_textline in enumerate(all_found_textline_polygons[j]): - if len(contour_textline) == 2: - points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) - 
points_co += ',' - points_co += str(int((contour_textline[1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) - - points_co += ' ' + for point in polygon_textline: + if len(point) != 2: + point = point[0] + point_x = point[0] + page_coord[2] + point_y = point[1] + page_coord[0] + # FIXME: or actually... not self.textline_light and not self.curved_line or np.abs(slopes[region_idx]) > 45? + if not self.textline_light and not (self.curved_line and np.abs(slopes[region_idx]) <= 45): + point_x += region_bboxes[2] + point_y += region_bboxes[0] + point_x = max(0, int(point_x / self.scale_x)) + point_y = max(0, int(point_y / self.scale_y)) + points_co += str(point_x) + ',' + str(point_y) + ' ' coords.set_points(points_co[:-1]) def write_pagexml(self, pcgts): @@ -170,7 +87,7 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, 
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -179,90 +96,79 @@ class EynollahXmlWriter(): page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) counter = EynollahIdCounter() - if len(found_polygons_text_region) > 0: + if len(order_of_texts): _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] - id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] + id_of_marginalia_left = [_counter_marginals.next_region_id + for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id + for _ in found_polygons_marginals_right] xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) - for mm in range(len(found_polygons_text_region)): - textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord, skip_layout_reading_order), conf=conf_contours_textregion[mm]), - ) - #textregion.set_conf(conf_contours_textregion[mm]) + for mm, region_contour in enumerate(found_polygons_text_region): + textregion = TextRegionType( + id=counter.next_region_id, type_='paragraph', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, + skip_layout_reading_order), + conf=conf_contours_textregion[mm]), + ) 
page.add_TextRegion(textregion) if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] else: ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter, ocr_textlines) + self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, + all_box_coord, slopes, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals_left)): - marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) + for mm, region_contour in enumerate(found_polygons_marginals_left): + marginal = TextRegionType( + id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_left: ocr_textlines = ocr_all_textlines_marginals_left[mm] else: ocr_textlines = None - - #print(ocr_textlines, mm, len(all_found_textline_polygons_marginals_left[mm]) ) - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, + all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals_right)): - marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + for mm, region_contour in enumerate(found_polygons_marginals_right): + marginal = TextRegionType( + id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(marginal) if 
ocr_all_textlines_marginals_right: ocr_textlines = ocr_all_textlines_marginals_right[mm] else: ocr_textlines = None - - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) + self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, + all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) - for mm in range(len(found_polygons_text_region_img)): - img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) + for region_contour in found_polygons_text_region_img: + img_region = ImageRegionType( + id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_ImageRegion(img_region) - points_co = '' - for lmm in range(len(found_polygons_text_region_img[mm])): - try: - points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - points_co += ' ' - except: - points_co += str(int((found_polygons_text_region_img[mm][lmm][0] + page_coord[2])/ self.scale_x )) - points_co += ',' - points_co += str(int((found_polygons_text_region_img[mm][lmm][1] + page_coord[0])/ self.scale_y )) - points_co += ' ' + for region_contour in polygons_seplines: + sep = SeparatorRegionType( + id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])) + ) + page.add_SeparatorRegion(sep) - img_region.get_Coords().set_points(points_co[:-1]) - - for mm in range(len(polygons_lines_to_be_written_in_xml)): - sep_hor = SeparatorRegionType(id=counter.next_region_id, Coords=CoordsType()) - page.add_SeparatorRegion(sep_hor) - points_co = '' - for lmm in range(len(polygons_lines_to_be_written_in_xml[mm])): - points_co += 
str(int((polygons_lines_to_be_written_in_xml[mm][lmm,0,0] ) / self.scale_x)) - points_co += ',' - points_co += str(int((polygons_lines_to_be_written_in_xml[mm][lmm,0,1] ) / self.scale_y)) - points_co += ' ' - sep_hor.get_Coords().set_points(points_co[:-1]) - for mm in range(len(found_polygons_tables)): - tab_region = TableRegionType(id=counter.next_region_id, Coords=CoordsType()) - page.add_TableRegion(tab_region) - points_co = '' - for lmm in range(len(found_polygons_tables[mm])): - points_co += str(int((found_polygons_tables[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((found_polygons_tables[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - points_co += ' ' - tab_region.get_Coords().set_points(points_co[:-1]) + for region_contour in found_polygons_tables: + tab = TableRegionType( + id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) + page.add_TableRegion(tab) return pcgts - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, 
all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -271,99 +177,112 @@ class EynollahXmlWriter(): page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) counter = EynollahIdCounter() - _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] - id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] - xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) + if len(order_of_texts): + _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia_left = [_counter_marginals.next_region_id + for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id + for _ in found_polygons_marginals_right] + xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) - for mm in range(len(found_polygons_text_region)): - textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord), conf=conf_contours_textregion[mm])) + for mm, 
region_contour in enumerate(found_polygons_text_region): + textregion = TextRegionType( + id=counter.next_region_id, type_='paragraph', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord), + conf=conf_contours_textregion[mm]) + ) page.add_TextRegion(textregion) - if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] else: ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter, ocr_textlines) + self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, + all_box_coord, slopes, counter, ocr_textlines) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) - for mm in range(len(found_polygons_text_region_h)): - textregion = TextRegionType(id=counter.next_region_id, type_='heading', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) + for mm, region_contour in enumerate(found_polygons_text_region_h): + textregion = TextRegionType( + id=counter.next_region_id, type_='heading', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(textregion) - if ocr_all_textlines_h: ocr_textlines = ocr_all_textlines_h[mm] else: ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter, ocr_textlines) + self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, + all_box_coord_h, slopes_h, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals_left)): - marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) + for mm, region_contour in enumerate(found_polygons_marginals_left): + marginal = TextRegionType( + id=counter.next_region_id, 
type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_left: ocr_textlines = ocr_all_textlines_marginals_left[mm] else: ocr_textlines = None - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) - - for mm in range(len(found_polygons_marginals_right)): - marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + + for mm, region_contour in enumerate(found_polygons_marginals_right): + marginal = TextRegionType( + id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_right: ocr_textlines = ocr_all_textlines_marginals_right[mm] else: ocr_textlines = None - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) - - for mm in range(len(found_polygons_drop_capitals)): - dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))) + self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, + all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) + + for mm, region_contour in enumerate(found_polygons_drop_capitals): + dropcapital = TextRegionType( + id=counter.next_region_id, 
type_='drop-capital', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(dropcapital) - all_box_coord_drop = None - slopes_drop = None + all_box_coord_drop = [[0, 0, 0, 0]] + slopes_drop = [0] if ocr_all_textlines_drop: ocr_textlines = ocr_all_textlines_drop[mm] else: ocr_textlines = None - self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=ocr_textlines) + self.serialize_lines_in_region(dropcapital, [[found_polygons_drop_capitals[mm]]], 0, page_coord, + all_box_coord_drop, slopes_drop, counter, ocr_textlines) - for mm in range(len(found_polygons_text_region_img)): - page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)))) + for region_contour in found_polygons_text_region_img: + page.add_ImageRegion( + ImageRegionType(id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)))) - for mm in range(len(polygons_lines_to_be_written_in_xml)): - page.add_SeparatorRegion(SeparatorRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(polygons_lines_to_be_written_in_xml[mm], [0 , 0, 0, 0])))) + for region_contour in polygons_seplines: + page.add_SeparatorRegion( + SeparatorRegionType(id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])))) - for mm in range(len(found_polygons_tables)): - page.add_TableRegion(TableRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_tables[mm], page_coord)))) + for region_contour in found_polygons_tables: + page.add_TableRegion( + TableRegionType(id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, 
page_coord)))) return pcgts def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False): self.logger.debug('enter calculate_polygon_coords') coords = '' - for value_bbox in contour: - if skip_layout_reading_order: - if len(value_bbox) == 2: - coords += str(int((value_bbox[0]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[1]) / self.scale_y)) - else: - coords += str(int((value_bbox[0][0]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[0][1]) / self.scale_y)) - else: - if len(value_bbox) == 2: - coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) - else: - coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) - coords=coords + ' ' + for point in contour: + if len(point) != 2: + point = point[0] + point_x = point[0] + point_y = point[1] + if not skip_layout_reading_order: + point_x += page_coord[2] + point_y += page_coord[0] + point_x = int(point_x / self.scale_x) + point_y = int(point_y / self.scale_y) + coords += str(point_x) + ',' + str(point_y) + ' ' return coords[:-1] From cbbb3248c72c1f3e50b98de1f7e2980bdd14da5d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 00:43:29 +0200 Subject: [PATCH 076/101] writer: simplify - `build_pagexml_no_full_layout`: delegate to `build_pagexml_full_layout` (removing redundant code) --- src/eynollah/writer.py | 133 +++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 84 deletions(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 67a2989..eee7440 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -87,8 +87,50 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, 
order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): - self.logger.debug('enter build_pagexml_no_full_layout') + def build_pagexml_no_full_layout( + self, found_polygons_text_region, + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, + all_box_coord, + found_polygons_text_region_img, + found_polygons_marginals_left, found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + found_polygons_tables, + **kwargs): + return self.build_pagexml_full_layout( + found_polygons_text_region, [], + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, [], + all_box_coord, [], + found_polygons_text_region_img, found_polygons_tables, [], + found_polygons_marginals_left, found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, [], slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + **kwargs) + + def build_pagexml_full_layout( + self, + found_polygons_text_region, found_polygons_text_region_h, + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, all_found_textline_polygons_h, + all_box_coord, all_box_coord_h, + 
found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, + found_polygons_marginals_left,found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + ocr_all_textlines=None, ocr_all_textlines_h=None, + ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, + ocr_all_textlines_drop=None, + conf_contours_textregion=None, conf_contours_textregion_h=None, + skip_layout_reading_order=False): + self.logger.debug('enter build_pagexml') # create the file structure pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) @@ -108,89 +150,10 @@ class EynollahXmlWriter(): textregion = TextRegionType( id=counter.next_region_id, type_='paragraph', Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, - skip_layout_reading_order), - conf=conf_contours_textregion[mm]), - ) - page.add_TextRegion(textregion) - if ocr_all_textlines: - ocr_textlines = ocr_all_textlines[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, - all_box_coord, slopes, counter, ocr_textlines) - - for mm, region_contour in enumerate(found_polygons_marginals_left): - marginal = TextRegionType( - id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TextRegion(marginal) - if ocr_all_textlines_marginals_left: - ocr_textlines = ocr_all_textlines_marginals_left[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, - all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) - - for mm, region_contour 
in enumerate(found_polygons_marginals_right): - marginal = TextRegionType( - id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TextRegion(marginal) - if ocr_all_textlines_marginals_right: - ocr_textlines = ocr_all_textlines_marginals_right[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, - all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) - - for region_contour in found_polygons_text_region_img: - img_region = ImageRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_ImageRegion(img_region) - - for region_contour in polygons_seplines: - sep = SeparatorRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])) - ) - page.add_SeparatorRegion(sep) - - for region_contour in found_polygons_tables: - tab = TableRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TableRegion(tab) - - return pcgts - - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, 
ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): - self.logger.debug('enter build_pagexml_full_layout') - - # create the file structure - pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) - page = pcgts.get_Page() - page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) - - counter = EynollahIdCounter() - if len(order_of_texts): - _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia_left = [_counter_marginals.next_region_id - for _ in found_polygons_marginals_left] - id_of_marginalia_right = [_counter_marginals.next_region_id - for _ in found_polygons_marginals_right] - xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) - - for mm, region_contour in enumerate(found_polygons_text_region): - textregion = TextRegionType( - id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord), - conf=conf_contours_textregion[mm]) + skip_layout_reading_order)) ) + if conf_contours_textregion: + textregion.Coords.set_conf(conf_contours_textregion[mm]) page.add_TextRegion(textregion) if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] @@ -205,6 +168,8 @@ class EynollahXmlWriter(): id=counter.next_region_id, type_='heading', Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) ) + if conf_contours_textregion_h: + textregion.Coords.set_conf(conf_contours_textregion_h[mm]) page.add_TextRegion(textregion) if ocr_all_textlines_h: ocr_textlines = ocr_all_textlines_h[mm] From 75823f9bed64153718acab6f664cdfc114ef34fb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 00:54:53 +0200 Subject: [PATCH 077/101] run_single: call `writer.build_pagexml_no_full_layout` w/ kwargs --- src/eynollah/eynollah.py | 32 
++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index ec68bcd..b109c90 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4260,18 +4260,6 @@ class Eynollah: order_text_new = [0] slopes =[0] id_of_texts_tot =['region_0001'] - - polygons_of_images = [] - slopes_marginals_left = [] - slopes_marginals_right = [] - polygons_of_marginals_left = [] - polygons_of_marginals_right = [] - all_found_textline_polygons_marginals_left = [] - all_found_textline_polygons_marginals_right = [] - all_box_coord_marginals_left = [] - all_box_coord_marginals_right = [] - polygons_seplines = [] - contours_tables = [] conf_contours_textregions =[0] if self.ocr and not self.tr: @@ -4284,15 +4272,13 @@ class Eynollah: pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, page_coord, polygons_of_images, - polygons_of_marginals_left, polygons_of_marginals_right, - all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, - all_box_coord_marginals_left, all_box_coord_marginals_right, - slopes, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, contours_tables, + all_found_textline_polygons, page_coord, [], + [], [], [], [], [], [], + slopes, [], [], + cont_page, [], [], ocr_all_textlines=ocr_all_textlines, conf_contours_textregion=conf_contours_textregions, - skip_layout_reading_order=self.skip_layout_and_reading_order) + skip_layout_reading_order=True) self.logger.info("Basic processing complete") return pcgts @@ -4884,9 +4870,11 @@ class Eynollah: all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, contours_tables, ocr_all_textlines, - 
ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, - conf_contours_textregions) + cont_page, polygons_seplines, contours_tables, + ocr_all_textlines=ocr_all_textlines, + ocr_all_textlines_marginals_left=ocr_all_textlines_marginals_left, + ocr_all_textlines_marginals_right=ocr_all_textlines_marginals_right, + conf_contours_textregions=conf_contours_textregions) return pcgts From 5e11a68a3e18e926b25829e0fce3c279e529aca0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 01:03:48 +0200 Subject: [PATCH 078/101] writer/run_single: consistent kwarg naming `conf_contours_textregion(s)` --- src/eynollah/writer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index eee7440..8859d95 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -128,7 +128,7 @@ class EynollahXmlWriter(): ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, - conf_contours_textregion=None, conf_contours_textregion_h=None, + conf_contours_textregions=None, conf_contours_textregions_h=None, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml') @@ -152,8 +152,8 @@ class EynollahXmlWriter(): Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, skip_layout_reading_order)) ) - if conf_contours_textregion: - textregion.Coords.set_conf(conf_contours_textregion[mm]) + if conf_contours_textregions: + textregion.Coords.set_conf(conf_contours_textregions[mm]) page.add_TextRegion(textregion) if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] @@ -168,8 +168,8 @@ class EynollahXmlWriter(): id=counter.next_region_id, type_='heading', Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) ) - if conf_contours_textregion_h: - textregion.Coords.set_conf(conf_contours_textregion_h[mm]) + if 
conf_contours_textregions_h: + textregion.Coords.set_conf(conf_contours_textregions_h[mm]) page.add_TextRegion(textregion) if ocr_all_textlines_h: ocr_textlines = ocr_all_textlines_h[mm] From ca72a095cab373b6daa2f7353f456d9eacfd399b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 00:44:32 +0200 Subject: [PATCH 079/101] tests: cover table detection in various modes --- tests/test_run.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/test_run.py b/tests/test_run.py index 98cee30..79c64c2 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -67,6 +67,44 @@ def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options): lines = tree.xpath("//page:TextLine", namespaces=NS) assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line +@pytest.mark.parametrize( + "options", + [ + ["--tables"], + ["--tables", "--full-layout"], + ["--tables", "--full-layout", "--textline_light", "--light_version"], + ], ids=str) +def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options): + infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif') + outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml' + args = [ + '-m', MODELS_LAYOUT, + '-i', str(infile), + '-o', str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + def only_eynollah(logrec): + return logrec.name == 'eynollah' + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(layout_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert str(infile) in logmsgs + assert outfile.exists() + tree = page_from_file(str(outfile)).etree + regions = tree.xpath("//page:TextRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + regions = 
tree.xpath("//page:TableRegion", namespaces=NS) + # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP + assert len(regions) >= 1, "result is inaccurate" + regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + lines = tree.xpath("//page:TextLine", namespaces=NS) + assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line + def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path From e5b52645685b669d5af7c5da2870a01660f81cdb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 12:17:53 +0200 Subject: [PATCH 080/101] CI: add diagnostic message for model symlink --- .github/workflows/test-eynollah.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 7c3f5ae..759b26c 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -66,6 +66,7 @@ jobs: python -m pip install --upgrade pip make install-dev EXTRAS=OCR,plotting make deps-test EXTRAS=OCR,plotting + ls -l models_* - name: Test with pytest run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml" - name: Get coverage results From 839b7c4d846d6f73069529aa1f337caa362917c0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 12:33:14 +0200 Subject: [PATCH 081/101] make models: avoid re-download --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 618b1f9..29dd877 100644 --- a/Makefile +++ b/Makefile @@ -58,6 +58,9 @@ help: # Download and extract models to $(PWD)/models_layout_v0_5_0 models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME) +# do not download these files if we already have the directories +.INTERMEDIATE: $(BIN_MODELFILE) $(SEG_MODELFILE) $(OCR_MODELFILE) + $(BIN_MODELFILE): wget -O $@ 
$(BIN_MODEL) $(SEG_MODELFILE): From 1d4815b48f1f5b1bf006efe78141fd3161ee8073 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 14:56:14 +0200 Subject: [PATCH 082/101] utils_ocr: forgot to pass coordinate offsets --- src/eynollah/eynollah.py | 24 ++++++++++++------------ src/eynollah/utils/utils_ocr.py | 10 ++++++++-- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b109c90..a6b65c4 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4265,8 +4265,8 @@ class Eynollah: if self.ocr and not self.tr: gc.collect() ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons, self.prediction_model, - self.b_s_ocr, self.num_to_char, textline_light=True) + image_page, all_found_textline_polygons, np.zeros((len(all_found_textline_polygons), 4)), + self.prediction_model, self.b_s_ocr, self.num_to_char, textline_light=True) else: ocr_all_textlines = None @@ -4756,36 +4756,36 @@ class Eynollah: if len(all_found_textline_polygons)>0: ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, all_found_textline_polygons, all_box_coord, + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_left, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, all_found_textline_polygons_marginals_left, all_box_coord_marginals_left, + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: 
ocr_all_textlines_marginals_left = None if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_right, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, all_found_textline_polygons_marginals_right, all_box_coord_marginals_right, + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_marginals_right = None if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_h, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, all_found_textline_polygons_h, all_box_coord_h, + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_h = None if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines( - image_page, polygons_of_drop_capitals, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, polygons_of_drop_capitals, np.zeros((len(polygons_of_drop_capitals), 4)), + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_drop = None diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 602ad6e..6e71b0f 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -1,13 +1,17 @@ +import math +import copy + import numpy as np import cv2 import tensorflow as tf from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d -import math from PIL import Image, ImageDraw, ImageFont from Bio import 
pairwise2 + from .resize import resize_image + def decode_batch_predictions(pred, num_to_char, max_len = 128): # input_len is the product of the batch size and the # number of time steps. @@ -370,7 +374,9 @@ def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind return textline_contour -def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, +def return_rnn_cnn_ocr_of_given_textlines(image, + all_found_textline_polygons, + all_box_coord, prediction_model, b_s_ocr, num_to_char, textline_light=False, From 027b87d32125afdc1bebbb968fc32b55b58bf153 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 14:56:57 +0200 Subject: [PATCH 083/101] fixup c0137c2 (missing arguments for utils_ocr) --- src/eynollah/eynollah.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index a6b65c4..aeb01be 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -96,6 +96,7 @@ from .utils.rotate import ( rotation_image_new ) from .utils.utils_ocr import ( + return_start_and_end_of_common_text_of_textline_ocr_without_common_section, return_textline_contour_with_added_box_coordinate, preprocess_and_resize_image_for_ocrcnn_model, return_textlines_split_if_needed, @@ -4796,7 +4797,6 @@ class Eynollah: self.logger.info("Using light text line detection for OCR") self.logger.info("Processing text lines...") - self.device.reset() gc.collect() torch.cuda.empty_cache() From 096def1e9d0b95cf3690734730f675ae5a74c0fd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 15:13:13 +0200 Subject: [PATCH 084/101] mbreorder/enhancment: fix missing imports (not sure if these models really need that, though) --- src/eynollah/image_enhancer.py | 6 +++--- src/eynollah/mb_ro_on_layout.py | 7 +++---- tests/test_smoke.py | 1 - 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py 
index 89dde16..9247efe 100644 --- a/src/eynollah/image_enhancer.py +++ b/src/eynollah/image_enhancer.py @@ -6,23 +6,23 @@ from logging import Logger import os import time from typing import Optional -import atexit -from functools import partial from pathlib import Path -from multiprocessing import cpu_count import gc + import cv2 import numpy as np from ocrd_utils import getLogger, tf_disable_interactive_logs import tensorflow as tf from skimage.morphology import skeletonize from tensorflow.keras.models import load_model + from .utils.resize import resize_image from .utils.pil_cv2 import pil2cv from .utils import ( is_image_filename, crop_image_inside_box ) +from .eynollah import PatchEncoder, Patches DPI_THRESHOLD = 298 KERNEL = np.ones((5, 5), np.uint8) diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index 45db8e4..218f973 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -6,25 +6,24 @@ from logging import Logger import os import time from typing import Optional -import atexit -from functools import partial from pathlib import Path -from multiprocessing import cpu_count import xml.etree.ElementTree as ET + import cv2 import numpy as np from ocrd_utils import getLogger import statistics import tensorflow as tf from tensorflow.keras.models import load_model -from .utils.resize import resize_image +from .utils.resize import resize_image from .utils.contour import ( find_new_features_of_contours, return_contours_of_image, return_parent_contours, ) from .utils import is_xml_filename +from .eynollah import PatchEncoder, Patches DPI_THRESHOLD = 298 KERNEL = np.ones((5, 5), np.uint8) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 252213f..e2b323a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2,6 +2,5 @@ def test_utils_import(): import eynollah.utils import eynollah.utils.contour import eynollah.utils.drop_capitals - import eynollah.utils.drop_capitals import 
eynollah.utils.is_nan import eynollah.utils.rotate From 8a2d682e12d8e95414aa53f1e2a9cfea74c778a3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 16:52:22 +0200 Subject: [PATCH 085/101] fix identifier scope in layout OCR options (w/o full_layout) --- src/eynollah/eynollah.py | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index aeb01be..7d6229a 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4726,7 +4726,6 @@ class Eynollah: self.plotter.write_images_into_directory(polygons_of_images, image_page) t_order = time.time() - #if self.full_layout: self.logger.info("Step 4/5: Reading Order Detection") if self.reading_order_machine_based: @@ -4749,46 +4748,41 @@ class Eynollah: boxes_d, textline_mask_tot_d) self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") + ocr_all_textlines = None + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None + ocr_all_textlines_h = None + ocr_all_textlines_drop = None if self.ocr: self.logger.info("Step 4.5/5: OCR Processing") if not self.tr: gc.collect() - if len(all_found_textline_polygons)>0: + if len(all_found_textline_polygons): ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons, all_box_coord, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines = None - if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: + if len(all_found_textline_polygons_marginals_left): ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_marginals_left, all_box_coord_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines_marginals_left = None - if 
all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: + if len(all_found_textline_polygons_marginals_right): ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_marginals_right, all_box_coord_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines_marginals_right = None - if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: + if self.full_layout and len(all_found_textline_polygons): ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_h, all_box_coord_h, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines_h = None - if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: + if self.full_layout and len(polygons_of_drop_capitals): ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines( image_page, polygons_of_drop_capitals, np.zeros((len(polygons_of_drop_capitals), 4)), self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines_drop = None else: if self.light_version: @@ -4805,6 +4799,7 @@ class Eynollah: ind_tot = 0 #cv2.imwrite('./img_out.png', image_page) ocr_all_textlines = [] + # FIXME: what about lines in marginals / headings / drop-capitals here? 
for indexing, ind_poly_first in enumerate(all_found_textline_polygons): ocr_textline_in_textregion = [] for indexing2, ind_poly in enumerate(ind_poly_first): @@ -4840,12 +4835,6 @@ class Eynollah: ocr_textline_in_textregion.append(text_ocr) ind_tot = ind_tot +1 ocr_all_textlines.append(ocr_textline_in_textregion) - else: - ocr_all_textlines = None - ocr_all_textlines_marginals_left = None - ocr_all_textlines_marginals_right = None - ocr_all_textlines_h = None - ocr_all_textlines_drop = None self.logger.info("Step 5/5: Output Generation") From b3d29bef8961435f85cf0c95ec3dd6c239e74621 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 19:21:07 +0200 Subject: [PATCH 086/101] return_contours_of_interested_region*: rm unused variants --- src/eynollah/eynollah.py | 17 +++++++---------- src/eynollah/utils/contour.py | 33 --------------------------------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 7d6229a..e15afd6 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -79,7 +79,6 @@ from .utils.contour import ( get_textregion_contours_in_org_image_light, return_contours_of_image, return_contours_of_interested_region, - return_contours_of_interested_region_by_min_size, return_contours_of_interested_textline, return_parent_contours, dilate_textregion_contours, @@ -4242,14 +4241,11 @@ class Eynollah: all_found_textline_polygons = filter_contours_area_of_image( textline_mask_tot_ea, cnt_clean_rot_raw, hir_on_cnt_clean_rot, max_area=1, min_area=0.00001) - M_main_tot = [cv2.moments(all_found_textline_polygons[j]) - for j in range(len(all_found_textline_polygons))] - w_h_textlines = [cv2.boundingRect(all_found_textline_polygons[j])[2:] - for j in range(len(all_found_textline_polygons))] - w_h_textlines = [w_h_textlines[j][0] / float(w_h_textlines[j][1]) for j in range(len(w_h_textlines))] - cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j 
in range(len(M_main_tot))] - cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - + cx_main_tot, cy_main_tot = find_center_of_contours(all_found_textline_polygons) + w_h_textlines = [cv2.boundingRect(polygon)[2:] + for polygon in all_found_textline_polygons] + w_h_textlines = [w / float(h) for w, h in w_h_textlines] + all_found_textline_polygons = self.get_textlines_of_a_textregion_sorted( #all_found_textline_polygons[::-1] all_found_textline_polygons, cx_main_tot, cy_main_tot, w_h_textlines) @@ -4677,7 +4673,8 @@ class Eynollah: self.plotter.save_plot_of_layout_all(text_regions_p, image_page) label_img = 4 - polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, label_img) + polygons_of_drop_capitals = return_contours_of_interested_region(text_regions_p, label_img, + min_area=0.00003) ##all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( ##text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, ##all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 2560846..f998c4d 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -253,39 +253,6 @@ def return_contours_of_image(image): contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) return contours, hierarchy -def return_contours_of_interested_region_by_min_size(region_pre_p, label, min_size=0.00003): - # pixels of images are identified by 5 - if region_pre_p.ndim == 3: - cnts_images = (region_pre_p[:, :, 0] == label) * 1 - else: - cnts_images = (region_pre_p[:, :] == label) * 1 - _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) - - contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_imgs = 
return_parent_contours(contours_imgs, hierarchy) - contours_imgs = filter_contours_area_of_image_tables( - thresh, contours_imgs, hierarchy, max_area=1, min_area=min_size) - - return contours_imgs - -def return_contours_of_interested_region_by_size(region_pre_p, label, min_area, max_area): - # pixels of images are identified by 5 - if region_pre_p.ndim == 3: - cnts_images = (region_pre_p[:, :, 0] == label) * 1 - else: - cnts_images = (region_pre_p[:, :] == label) * 1 - _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) - contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - contours_imgs = return_parent_contours(contours_imgs, hierarchy) - contours_imgs = filter_contours_area_of_image_tables( - thresh, contours_imgs, hierarchy, max_area=max_area, min_area=min_area) - - img_ret = np.zeros((region_pre_p.shape[0], region_pre_p.shape[1])) - img_ret = cv2.fillPoly(img_ret, pts=contours_imgs, color=1) - - return img_ret - def dilate_textline_contours(all_found_textline_polygons): return [[polygon2contour(contour2polygon(contour, dilate=6)) for contour in region] From a144026b2789ae056c7bac619d2e3e2b582e62d6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 15:13:57 +0200 Subject: [PATCH 087/101] add rough ruff config --- pyproject.toml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 8a63543..2df39b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,3 +51,18 @@ where = ["src"] [tool.coverage.run] branch = true source = ["eynollah"] + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +ignore = [ +# disable unused imports +"F401", +# disable import order +"E402", +# disable unused variables +"F841", +# disable bare except +"E722", +] From e1b56d97dab9eed6110fabd85b5ae74b36f18c9f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 17:54:38 +0200 Subject: [PATCH 088/101] CI: lint with ruff --- 
.github/workflows/test-eynollah.yml | 4 ++++ pyproject.toml | 3 +++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 759b26c..466e690 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -67,6 +67,10 @@ jobs: make install-dev EXTRAS=OCR,plotting make deps-test EXTRAS=OCR,plotting ls -l models_* + - name: Lint with ruff + uses: astral-sh/ruff-action@v3 + with: + src: "./src" - name: Test with pytest run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml" - name: Get coverage results diff --git a/pyproject.toml b/pyproject.toml index 2df39b9..79f9164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,3 +66,6 @@ ignore = [ # disable bare except "E722", ] + +[tool.ruff.format] +quote-style = "preserve" From cab392601e74e0360e659296f26e1719fb6f742f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Oct 2025 20:12:06 +0200 Subject: [PATCH 089/101] :memo: update changelog --- CHANGELOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6776d6..ab3dd83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,11 +15,17 @@ Fixed: * `get_smallest_skew`: after shifting search range of rotation angle, use overall best result * Dockerfile: fix CUDA installation (cuDNN contested between Torch and TF due to extra OCR) * OCR: re-instate missing methods and fix `utils_ocr` function calls + * mbreorder/enhancement CLIs: missing imports * :fire: writer: `SeparatorRegion` needs `SeparatorRegionType` (not `ImageRegionType`) f458e3e * tests: switch from `pytest-subtests` to `parametrize` so we can use `pytest-isolate` (so CUDA memory gets freed between tests if running on GPU) +Added: + * test coverage for OCR options in `layout` + * test coverage for table detection in `layout` + * CI linting with ruff + Changed: * polygons: slightly widen for regions and lines, increase for separators @@ -28,7 +34,19 
@@ Changed: but use shared memory if necessary, and switch back from `loky` to stdlib, and shutdown in `del()` instead of `atexit` * :fire: OCR: switch CNN-RNN model to `20250930` version compatible with TF 2.12 on CPU, too + * OCR: allow running `-tr` without `-fl`, too * :fire: writer: use `@type='heading'` instead of `'header'` for headings + * :fire: performance gains via refactoring (simplification, less copy-code, vectorization, + avoiding unused calculations, avoiding unnecessary 3-channel image operations) + * :fire: heuristic reading order detection: many improvements + - contour vs splitter box matching: + * contour must be contained in box exactly instead of heuristics + * make fallback center matching, center must be contained in box + - original vs deskewed contour matching: + * same min-area filter on both sides + * similar area score in addition to center proximity + * avoid duplicate and missing mappings by allowing N:M + matches and splitting+joining where necessary * CI: update+improve model caching From c4cb16c2a8e92b0d14b2388ad7a7e8d06e6472fe Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Oct 2025 23:05:50 +0200 Subject: [PATCH 090/101] simplify (`skip_layout_and_reading_order` is already an attr) --- src/eynollah/eynollah.py | 205 +++++++++++++++++++-------------------- 1 file changed, 102 insertions(+), 103 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 1b6cee0..3579078 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2155,7 +2155,7 @@ class Eynollah: page_coord, cont_page) - def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier, skip_layout_and_reading_order=False): + def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_light_v") t_in = time.time() erosion_hurts = False @@ -2221,110 +2221,110 @@ class Eynollah: #plt.imshwo(self.image_page_org_size) #plt.show() - if not 
skip_layout_and_reading_order: - #print("inside 2 ", time.time()-t_in) - if num_col_classifier == 1 or num_col_classifier == 2: - if self.image_org.shape[0]/self.image_org.shape[1] > 2.5: - self.logger.debug("resized to %dx%d for %d cols", - img_resized.shape[1], img_resized.shape[0], num_col_classifier) - prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( - True, img_resized, self.model_region_1_2, n_batch_inference=1, - thresholding_for_some_classes_in_light_version=True, - threshold_art_class_layout=self.threshold_art_class_layout) - else: - prediction_regions_org = np.zeros((self.image_org.shape[0], self.image_org.shape[1], 3)) - confidence_matrix = np.zeros((self.image_org.shape[0], self.image_org.shape[1])) - prediction_regions_page, confidence_matrix_page = self.do_prediction_new_concept( - False, self.image_page_org_size, self.model_region_1_2, n_batch_inference=1, - thresholding_for_artificial_class_in_light_version=True, - threshold_art_class_layout=self.threshold_art_class_layout) - ys = slice(*self.page_coord[0:2]) - xs = slice(*self.page_coord[2:4]) - prediction_regions_org[ys, xs] = prediction_regions_page - confidence_matrix[ys, xs] = confidence_matrix_page - - else: - new_h = (900+ (num_col_classifier-3)*100) - img_resized = resize_image(img_bin, int(new_h * img_bin.shape[0] /img_bin.shape[1]), new_h) - self.logger.debug("resized to %dx%d (new_h=%d) for %d cols", - img_resized.shape[1], img_resized.shape[0], new_h, num_col_classifier) - prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( - True, img_resized, self.model_region_1_2, n_batch_inference=2, - thresholding_for_some_classes_in_light_version=True, - threshold_art_class_layout=self.threshold_art_class_layout) - ###prediction_regions_org = self.do_prediction(True, img_bin, self.model_region, - ###n_batch_inference=3, - ###thresholding_for_some_classes_in_light_version=True) - #print("inside 3 ", time.time()-t_in) - 
#plt.imshow(prediction_regions_org[:,:,0]) - #plt.show() - - prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) - confidence_matrix = resize_image(confidence_matrix, img_height_h, img_width_h ) - img_bin = resize_image(img_bin, img_height_h, img_width_h ) - prediction_regions_org=prediction_regions_org[:,:,0] - - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 - mask_texts_only = (prediction_regions_org[:,:] ==1)*1 - mask_texts_only = mask_texts_only.astype('uint8') - - ##if num_col_classifier == 1 or num_col_classifier == 2: - ###mask_texts_only = cv2.erode(mask_texts_only, KERNEL, iterations=1) - ##mask_texts_only = cv2.dilate(mask_texts_only, KERNEL, iterations=1) - - mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) - mask_images_only=(prediction_regions_org[:,:] ==2)*1 - - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) - test_khat = np.zeros(prediction_regions_org.shape) - test_khat = cv2.fillPoly(test_khat, pts=polygons_seplines, color=(1,1,1)) - - #plt.imshow(test_khat[:,:]) - #plt.show() - #for jv in range(1): - #print(jv, hir_seplines[0][232][3]) - #test_khat = np.zeros(prediction_regions_org.shape) - #test_khat = cv2.fillPoly(test_khat, pts = [polygons_seplines[232]], color=(1,1,1)) - #plt.imshow(test_khat[:,:]) - #plt.show() - - polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) - - test_khat = np.zeros(prediction_regions_org.shape) - test_khat = cv2.fillPoly(test_khat, pts = polygons_seplines, color=(1,1,1)) - - #plt.imshow(test_khat[:,:]) - #plt.show() - #sys.exit() - - polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - ##polygons_of_only_texts = dilate_textregion_contours(polygons_of_only_texts) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) - - 
text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_lines, color=(3,3,3)) - - text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) - - textline_mask_tot_ea[(text_regions_p_true==0) | (text_regions_p_true==4) ] = 0 - #plt.imshow(textline_mask_tot_ea) - #plt.show() - #print("inside 4 ", time.time()-t_in) - self.logger.debug("exit get_regions_light_v") - return (text_regions_p_true, - erosion_hurts, - polygons_seplines, - polygons_of_only_texts, - textline_mask_tot_ea, - img_bin, - confidence_matrix) - else: + if self.skip_layout_and_reading_order: img_bin = resize_image(img_bin,img_height_h, img_width_h ) self.logger.debug("exit get_regions_light_v") return None, erosion_hurts, None, None, textline_mask_tot_ea, img_bin, None + #print("inside 2 ", time.time()-t_in) + if num_col_classifier == 1 or num_col_classifier == 2: + if self.image_org.shape[0]/self.image_org.shape[1] > 2.5: + self.logger.debug("resized to %dx%d for %d cols", + img_resized.shape[1], img_resized.shape[0], num_col_classifier) + prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( + True, img_resized, self.model_region_1_2, n_batch_inference=1, + thresholding_for_some_classes_in_light_version=True, + threshold_art_class_layout=self.threshold_art_class_layout) + else: + prediction_regions_org = np.zeros((self.image_org.shape[0], self.image_org.shape[1], 3)) + confidence_matrix = np.zeros((self.image_org.shape[0], self.image_org.shape[1])) + prediction_regions_page, confidence_matrix_page = self.do_prediction_new_concept( + False, self.image_page_org_size, self.model_region_1_2, n_batch_inference=1, + thresholding_for_artificial_class_in_light_version=True, + threshold_art_class_layout=self.threshold_art_class_layout) + ys = slice(*self.page_coord[0:2]) + xs = 
slice(*self.page_coord[2:4]) + prediction_regions_org[ys, xs] = prediction_regions_page + confidence_matrix[ys, xs] = confidence_matrix_page + + else: + new_h = (900+ (num_col_classifier-3)*100) + img_resized = resize_image(img_bin, int(new_h * img_bin.shape[0] /img_bin.shape[1]), new_h) + self.logger.debug("resized to %dx%d (new_h=%d) for %d cols", + img_resized.shape[1], img_resized.shape[0], new_h, num_col_classifier) + prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( + True, img_resized, self.model_region_1_2, n_batch_inference=2, + thresholding_for_some_classes_in_light_version=True, + threshold_art_class_layout=self.threshold_art_class_layout) + ###prediction_regions_org = self.do_prediction(True, img_bin, self.model_region, + ###n_batch_inference=3, + ###thresholding_for_some_classes_in_light_version=True) + #print("inside 3 ", time.time()-t_in) + #plt.imshow(prediction_regions_org[:,:,0]) + #plt.show() + + prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) + confidence_matrix = resize_image(confidence_matrix, img_height_h, img_width_h ) + img_bin = resize_image(img_bin, img_height_h, img_width_h ) + prediction_regions_org=prediction_regions_org[:,:,0] + + mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_texts_only = (prediction_regions_org[:,:] ==1)*1 + mask_texts_only = mask_texts_only.astype('uint8') + + ##if num_col_classifier == 1 or num_col_classifier == 2: + ###mask_texts_only = cv2.erode(mask_texts_only, KERNEL, iterations=1) + ##mask_texts_only = cv2.dilate(mask_texts_only, KERNEL, iterations=1) + + mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) + mask_images_only=(prediction_regions_org[:,:] ==2)*1 + + polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + test_khat = np.zeros(prediction_regions_org.shape) + test_khat = cv2.fillPoly(test_khat, pts=polygons_seplines, color=(1,1,1)) + + 
#plt.imshow(test_khat[:,:]) + #plt.show() + #for jv in range(1): + #print(jv, hir_seplines[0][232][3]) + #test_khat = np.zeros(prediction_regions_org.shape) + #test_khat = cv2.fillPoly(test_khat, pts = [polygons_seplines[232]], color=(1,1,1)) + #plt.imshow(test_khat[:,:]) + #plt.show() + + polygons_seplines = filter_contours_area_of_image( + mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + + test_khat = np.zeros(prediction_regions_org.shape) + test_khat = cv2.fillPoly(test_khat, pts = polygons_seplines, color=(1,1,1)) + + #plt.imshow(test_khat[:,:]) + #plt.show() + #sys.exit() + + polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) + ##polygons_of_only_texts = dilate_textregion_contours(polygons_of_only_texts) + polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + + text_regions_p_true = np.zeros(prediction_regions_org.shape) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_lines, color=(3,3,3)) + + text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) + + textline_mask_tot_ea[(text_regions_p_true==0) | (text_regions_p_true==4) ] = 0 + #plt.imshow(textline_mask_tot_ea) + #plt.show() + #print("inside 4 ", time.time()-t_in) + self.logger.debug("exit get_regions_light_v") + return (text_regions_p_true, + erosion_hurts, + polygons_seplines, + polygons_of_only_texts, + textline_mask_tot_ea, + img_bin, + confidence_matrix) + def get_regions_from_xy_2models(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_from_xy_2models") erosion_hurts = False @@ -4226,8 +4226,7 @@ class Eynollah: self.logger.info("Skipping layout analysis and reading order detection") _ ,_, _, _, textline_mask_tot_ea, img_bin_light, _ = \ - self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier, - 
skip_layout_and_reading_order=self.skip_layout_and_reading_order) + self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier,) page_coord, image_page, textline_mask_tot_ea, img_bin_light, cont_page = \ self.run_graphics_and_columns_without_layout(textline_mask_tot_ea, img_bin_light) From 374818de118dc0292dde789c6c3a233dbce4d83d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Oct 2025 23:11:05 +0200 Subject: [PATCH 091/101] :memo: update changelog for 5725e4f --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a0f190..6fd3b2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ Versioned according to [Semantic Versioning](http://semver.org/). Fixed: + * continue processing when no columns detected but text regions exist + * convert marginalia to main text if no main text is present + * reset deskewing angle to 0° when text covers <30% image area and detected angle >45° * :fire: polygons: avoid invalid paths (use `Polygon.buffer()` instead of dilation etc.) 
* `return_boxes_of_images_by_order_of_reading_new`: avoid Numpy.dtype mismatch, simplify * `return_boxes_of_images_by_order_of_reading_new`: log any exceptions instead of ignoring From 4e9a1618c355a7aeed471c9f63018440adf441cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 10 Oct 2025 03:18:09 +0200 Subject: [PATCH 092/101] layout: refactor model setup, allow loading custom versions - simplify definition of (defaults for) model versions - unify loading of loadable models (depending on mode) - use `self.models` dict instead of `self.model_*` attributes - add `model_versions` kwarg / `--model_version` CLI option --- CHANGELOG.md | 1 + src/eynollah/cli.py | 10 +- src/eynollah/eynollah.py | 362 +++++++++++++++++++-------------------- 3 files changed, 191 insertions(+), 182 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fd3b2e..df1e12e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ f458e3e (so CUDA memory gets freed between tests if running on GPU) Added: + * :fire: `layout` CLI: new option `--model_version` to override default choices * test coverage for OCR options in `layout` * test coverage for table detection in `layout` * CI linting with ruff diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 93bb676..c9bad52 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -202,6 +202,13 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low type=click.Path(exists=True, file_okay=False), required=True, ) +@click.option( + "--model_version", + "-mv", + help="override default versions of model categories", + type=(str, str), + multiple=True, +) @click.option( "--save_images", "-si", @@ -373,7 +380,7 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low help="Setup a basic console logger", ) -def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, 
curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level, setup_logging): +def layout(image, out, overwrite, dir_in, model, model_version, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level, setup_logging): if setup_logging: console_handler = logging.StreamHandler(sys.stdout) console_handler.setLevel(logging.INFO) @@ -404,6 +411,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." 
eynollah = Eynollah( model, + model_versions=model_version, extract_only_images=extract_only_images, enable_plotting=enable_plotting, allow_enhancement=allow_enhancement, diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 3579078..0992c8c 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -19,7 +19,7 @@ import math import os import sys import time -from typing import Optional +from typing import Dict, List, Optional, Tuple import atexit import warnings from functools import partial @@ -180,7 +180,6 @@ class Patches(layers.Layer): }) return config - class PatchEncoder(layers.Layer): def __init__(self, **kwargs): super(PatchEncoder, self).__init__() @@ -208,6 +207,7 @@ class Eynollah: def __init__( self, dir_models : str, + model_versions: List[Tuple[str, str]] = [], extract_only_images : bool =False, enable_plotting : bool = False, allow_enhancement : bool = False, @@ -254,6 +254,10 @@ class Eynollah: self.skip_layout_and_reading_order = skip_layout_and_reading_order self.ocr = do_ocr self.tr = transformer_ocr + if not batch_size_ocr: + self.b_s_ocr = 8 + else: + self.b_s_ocr = int(batch_size_ocr) if num_col_upper: self.num_col_upper = int(num_col_upper) else: @@ -275,69 +279,6 @@ class Eynollah: self.threshold_art_class_textline = float(threshold_art_class_textline) else: self.threshold_art_class_textline = 0.1 - - self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" - self.model_dir_of_binarization = dir_models + "/eynollah-binarization_20210425" - self.model_dir_of_col_classifier = dir_models + "/eynollah-column-classifier_20210425" - self.model_region_dir_p = dir_models + "/eynollah-main-regions-aug-scaling_20210425" - self.model_region_dir_p2 = dir_models + "/eynollah-main-regions-aug-rotation_20210425" - #"/modelens_full_lay_1_3_031124" - #"/modelens_full_lay_13__3_19_241024" - #"/model_full_lay_13_241024" - #"/modelens_full_lay_13_17_231024" - #"/modelens_full_lay_1_2_221024" - 
#"/eynollah-full-regions-1column_20210425" - self.model_region_dir_fully_np = dir_models + "/modelens_full_lay_1__4_3_091124" - #self.model_region_dir_fully = dir_models + "/eynollah-full-regions-3+column_20210425" - self.model_page_dir = dir_models + "/model_eynollah_page_extraction_20250915" - self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" - self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" - self.model_region_dir_p_ens_light_only_images_extraction = (dir_models + - "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - ) - self.model_reading_order_dir = (dir_models + - "/model_eynollah_reading_order_20250824" - #"/model_mb_ro_aug_ens_11" - #"/model_step_3200000_mb_ro" - #"/model_ens_reading_order_machine_based" - #"/model_mb_ro_aug_ens_8" - #"/model_ens_reading_order_machine_based" - ) - #"/modelens_12sp_elay_0_3_4__3_6_n" - #"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" - #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" - #"/modelens_1_2_4_5_early_lay_1_2_spaltige" - #"/model_3_eraly_layout_no_patches_1_2_spaltige" - self.model_region_dir_p_1_2_sp_np = dir_models + "/modelens_e_l_all_sp_0_1_2_3_4_171024" - ##self.model_region_dir_fully_new = dir_models + "/model_2_full_layout_new_trans" - #"/modelens_full_lay_1_3_031124" - #"/modelens_full_lay_13__3_19_241024" - #"/model_full_lay_13_241024" - #"/modelens_full_lay_13_17_231024" - #"/modelens_full_lay_1_2_221024" - #"/modelens_full_layout_24_till_28" - #"/model_2_full_layout_new_trans" - self.model_region_dir_fully = dir_models + "/modelens_full_lay_1__4_3_091124" - if self.textline_light: - #"/modelens_textline_1_4_16092024" - #"/model_textline_ens_3_4_5_6_artificial" - #"/modelens_textline_1_3_4_20240915" - #"/model_textline_ens_3_4_5_6_artificial" - #"/modelens_textline_9_12_13_14_15" - #"/eynollah-textline_light_20210425" - self.model_textline_dir = dir_models + "/modelens_textline_0_1__2_4_16092024" - else: - 
#"/eynollah-textline_20210425" - self.model_textline_dir = dir_models + "/modelens_textline_0_1__2_4_16092024" - if self.ocr and self.tr: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_trocr_20250919" - elif self.ocr and not self.tr: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250930" - if self.tables: - if self.light_version: - self.model_table_dir = dir_models + "/modelens_table_0t4_201124" - else: - self.model_table_dir = dir_models + "/eynollah-tables_20210319" t_start = time.time() @@ -356,28 +297,124 @@ class Eynollah: self.logger.warning("no GPU device available") self.logger.info("Loading models...") - - self.model_page = self.our_load_model(self.model_page_dir) - self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) - self.model_bin = self.our_load_model(self.model_dir_of_binarization) - if self.extract_only_images: - self.model_region = self.our_load_model(self.model_region_dir_p_ens_light_only_images_extraction) - else: - self.model_textline = self.our_load_model(self.model_textline_dir) + self.setup_models(dir_models, model_versions) + self.logger.info(f"Model initialization complete ({time.time() - t_start:.1f}s)") + + @staticmethod + def our_load_model(model_file, basedir=""): + if basedir: + model_file = os.path.join(basedir, model_file) + if model_file.endswith('.h5') and Path(model_file[:-3]).exists(): + # prefer SavedModel over HDF5 format if it exists + model_file = model_file[:-3] + try: + model = load_model(model_file, compile=False) + except: + model = load_model(model_file, compile=False, custom_objects={ + "PatchEncoder": PatchEncoder, "Patches": Patches}) + return model + + def setup_models(self, basedir: Path, model_versions: List[Tuple[str, str]] = []): + self.model_versions = { + "enhancement": "eynollah-enhancement_20210425", + "binarization": "eynollah-binarization_20210425", + "col_classifier": "eynollah-column-classifier_20210425", + "page": 
"model_eynollah_page_extraction_20250915", + #?: "eynollah-main-regions-aug-scaling_20210425", + "region": ( # early layout + "eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" if self.extract_only_images else + "eynollah-main-regions_20220314" if self.light_version else + "eynollah-main-regions-ensembled_20210425"), + "region_p2": ( # early layout, non-light, 2nd part + "eynollah-main-regions-aug-rotation_20210425"), + "region_1_2": ( # early layout, light, 1-or-2-column + #"modelens_12sp_elay_0_3_4__3_6_n" + #"modelens_earlylayout_12spaltige_2_3_5_6_7_8" + #"modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" + #"modelens_1_2_4_5_early_lay_1_2_spaltige" + #"model_3_eraly_layout_no_patches_1_2_spaltige" + "modelens_e_l_all_sp_0_1_2_3_4_171024"), + "region_fl_np": ( # full layout / no patches + #"modelens_full_lay_1_3_031124" + #"modelens_full_lay_13__3_19_241024" + #"model_full_lay_13_241024" + #"modelens_full_lay_13_17_231024" + #"modelens_full_lay_1_2_221024" + #"eynollah-full-regions-1column_20210425" + "modelens_full_lay_1__4_3_091124"), + "region_fl": ( # full layout / with patches + #"eynollah-full-regions-3+column_20210425" + ##"model_2_full_layout_new_trans" + #"modelens_full_lay_1_3_031124" + #"modelens_full_lay_13__3_19_241024" + #"model_full_lay_13_241024" + #"modelens_full_lay_13_17_231024" + #"modelens_full_lay_1_2_221024" + #"modelens_full_layout_24_till_28" + #"model_2_full_layout_new_trans" + "modelens_full_lay_1__4_3_091124"), + "reading_order": ( + #"model_mb_ro_aug_ens_11" + #"model_step_3200000_mb_ro" + #"model_ens_reading_order_machine_based" + #"model_mb_ro_aug_ens_8" + #"model_ens_reading_order_machine_based" + "model_eynollah_reading_order_20250824"), + "textline": ( + #"modelens_textline_1_4_16092024" + #"model_textline_ens_3_4_5_6_artificial" + #"modelens_textline_1_3_4_20240915" + #"model_textline_ens_3_4_5_6_artificial" + #"modelens_textline_9_12_13_14_15" + #"eynollah-textline_light_20210425" + 
"modelens_textline_0_1__2_4_16092024" if self.textline_light else + #"eynollah-textline_20210425" + "modelens_textline_0_1__2_4_16092024"), + "table": ( + None if not self.tables else + "modelens_table_0t4_201124" if self.light_version else + "eynollah-tables_20210319"), + "ocr": ( + None if not self.ocr else + "model_eynollah_ocr_trocr_20250919" if self.tr else + "model_eynollah_ocr_cnnrnn_20250930") + } + # override defaults from CLI + for key, val in model_versions: + assert key in self.model_versions, "unknown model category '%s'" % key + self.logger.warning("overriding default model %s version %s to %s", key, self.model_versions[key], val) + self.model_versions[key] = val + # load models, depending on modes + loadable = [ + "col_classifier", + "binarization", + "page", + "region" + ] + if not self.extract_only_images: + loadable.append("textline") if self.light_version: - self.model_region = self.our_load_model(self.model_region_dir_p_ens_light) - self.model_region_1_2 = self.our_load_model(self.model_region_dir_p_1_2_sp_np) + loadable.append("region_1_2") else: - self.model_region = self.our_load_model(self.model_region_dir_p_ens) - self.model_region_p2 = self.our_load_model(self.model_region_dir_p2) - self.model_enhancement = self.our_load_model(self.model_dir_of_enhancement) - ###self.model_region_fl_new = self.our_load_model(self.model_region_dir_fully_new) - self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np) - self.model_region_fl = self.our_load_model(self.model_region_dir_fully) + loadable.append("region_p2") + # if self.allow_enhancement:? 
+ loadable.append("enhancement") + if self.full_layout: + loadable.extend(["region_fl_np", + "region_fl"]) if self.reading_order_machine_based: - self.model_reading_order = self.our_load_model(self.model_reading_order_dir) - if self.ocr and self.tr: - self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) + loadable.append("reading_order") + if self.tables: + loadable.append("table") + + self.models = {name: self.our_load_model(self.model_versions[name], basedir) + for name in loadable + } + + if self.ocr: + ocr_model_dir = os.path.join(basedir, self.model_versions["ocr"]) + if self.tr: + self.models["ocr"] = VisionEncoderDecoderModel.from_pretrained(ocr_model_dir) if torch.cuda.is_available(): self.logger.info("Using GPU acceleration") self.device = torch.device("cuda:0") @@ -386,54 +423,29 @@ class Eynollah: self.device = torch.device("cpu") #self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") - elif self.ocr and not self.tr: - model_ocr = load_model(self.model_ocr_dir , compile=False) - - self.prediction_model = tf.keras.models.Model( - model_ocr.get_layer(name = "image").input, - model_ocr.get_layer(name = "dense2").output) - if not batch_size_ocr: - self.b_s_ocr = 8 - else: - self.b_s_ocr = int(batch_size_ocr) + else: + ocr_model = load_model(ocr_model_dir, compile=False) + self.models["ocr"] = tf.keras.models.Model( + ocr_model.get_layer(name = "image").input, + ocr_model.get_layer(name = "dense2").output) - with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + with open(os.path.join(ocr_model_dir, "characters_org.txt"), "r") as config_file: characters = json.load(config_file) - - AUTOTUNE = tf.data.AUTOTUNE - # Mapping characters to integers. char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) - # Mapping integers back to original characters. 
self.num_to_char = StringLookup( vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True ) - - if self.tables: - self.model_table = self.our_load_model(self.model_table_dir) - - self.logger.info(f"Model initialization complete ({time.time() - t_start:.1f}s)") def __del__(self): if hasattr(self, 'executor') and getattr(self, 'executor'): self.executor.shutdown() - for model_name in ['model_page', - 'model_classifier', - 'model_bin', - 'model_enhancement', - 'model_region', - 'model_region_1_2', - 'model_region_p2', - 'model_region_fl_np', - 'model_region_fl', - 'model_textline', - 'model_reading_order', - 'model_table', - 'model_ocr', - 'processor']: - if hasattr(self, model_name) and getattr(self, model_name): - delattr(self, model_name) + self.executor = None + if hasattr(self, 'models') and getattr(self, 'models'): + for model_name in list(self.models): + if self.models[model_name]: + del self.models[model_name] def cache_images(self, image_filename=None, image_pil=None, dpi=None): ret = {} @@ -480,8 +492,8 @@ class Eynollah: def predict_enhancement(self, img): self.logger.debug("enter predict_enhancement") - img_height_model = self.model_enhancement.layers[-1].output_shape[1] - img_width_model = self.model_enhancement.layers[-1].output_shape[2] + img_height_model = self.models["enhancement"].layers[-1].output_shape[1] + img_width_model = self.models["enhancement"].layers[-1].output_shape[2] if img.shape[0] < img_height_model: img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST) if img.shape[1] < img_width_model: @@ -522,7 +534,7 @@ class Eynollah: index_y_d = img_h - img_height_model img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :] - label_p_pred = self.model_enhancement.predict(img_patch, verbose=0) + label_p_pred = self.models["enhancement"].predict(img_patch, verbose=0) seg = label_p_pred[0, :, :, :] * 255 if i == 0 and j == 0: @@ -697,7 +709,7 @@ class Eynollah: img_in[0, :, :, 1] = 
img_1ch[:, :] img_in[0, :, :, 2] = img_1ch[:, :] - label_p_pred = self.model_classifier.predict(img_in, verbose=0) + label_p_pred = self.models["col_classifier"].predict(img_in, verbose=0) num_col = np.argmax(label_p_pred[0]) + 1 self.logger.info("Found %s columns (%s)", num_col, label_p_pred) @@ -715,7 +727,7 @@ class Eynollah: self.logger.info("Detected %s DPI", dpi) if self.input_binary: img = self.imread() - prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) + prediction_bin = self.do_prediction(True, img, self.models["binarization"], n_batch_inference=5) prediction_bin = 255 * (prediction_bin[:,:,0] == 0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) img= np.copy(prediction_bin) @@ -755,7 +767,7 @@ class Eynollah: img_in[0, :, :, 1] = img_1ch[:, :] img_in[0, :, :, 2] = img_1ch[:, :] - label_p_pred = self.model_classifier.predict(img_in, verbose=0) + label_p_pred = self.models["col_classifier"].predict(img_in, verbose=0) num_col = np.argmax(label_p_pred[0]) + 1 elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): @@ -776,7 +788,7 @@ class Eynollah: img_in[0, :, :, 1] = img_1ch[:, :] img_in[0, :, :, 2] = img_1ch[:, :] - label_p_pred = self.model_classifier.predict(img_in, verbose=0) + label_p_pred = self.models["col_classifier"].predict(img_in, verbose=0) num_col = np.argmax(label_p_pred[0]) + 1 if num_col > self.num_col_upper: @@ -1628,7 +1640,7 @@ class Eynollah: cont_page = [] if not self.ignore_page_extraction: img = np.copy(self.image)#cv2.GaussianBlur(self.image, (5, 5), 0) - img_page_prediction = self.do_prediction(False, img, self.model_page) + img_page_prediction = self.do_prediction(False, img, self.models["page"]) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) ##thresh = cv2.dilate(thresh, KERNEL, iterations=3) @@ -1676,7 +1688,7 @@ class Eynollah: else: img = self.imread() 
img = cv2.GaussianBlur(img, (5, 5), 0) - img_page_prediction = self.do_prediction(False, img, self.model_page) + img_page_prediction = self.do_prediction(False, img, self.models["page"]) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) @@ -1702,7 +1714,7 @@ class Eynollah: self.logger.debug("enter extract_text_regions") img_height_h = img.shape[0] img_width_h = img.shape[1] - model_region = self.model_region_fl if patches else self.model_region_fl_np + model_region = self.models["region_fl"] if patches else self.models["region_fl_np"] if self.light_version: thresholding_for_fl_light_version = True @@ -1737,7 +1749,7 @@ class Eynollah: self.logger.debug("enter extract_text_regions") img_height_h = img.shape[0] img_width_h = img.shape[1] - model_region = self.model_region_fl if patches else self.model_region_fl_np + model_region = self.models["region_fl"] if patches else self.models["region_fl_np"] if not patches: img = otsu_copy_binary(img) @@ -1958,14 +1970,14 @@ class Eynollah: img_w = img_org.shape[1] img = resize_image(img_org, int(img_org.shape[0] * scaler_h), int(img_org.shape[1] * scaler_w)) - prediction_textline = self.do_prediction(use_patches, img, self.model_textline, + prediction_textline = self.do_prediction(use_patches, img, self.models["textline"], marginal_of_patch_percent=0.15, n_batch_inference=3, thresholding_for_artificial_class_in_light_version=self.textline_light, threshold_art_class_textline=self.threshold_art_class_textline) #if not self.textline_light: #if num_col_classifier==1: - #prediction_textline_nopatch = self.do_prediction(False, img, self.model_textline) + #prediction_textline_nopatch = self.do_prediction(False, img, self.models["textline"]) #prediction_textline[:,:][prediction_textline_nopatch[:,:]==0] = 0 prediction_textline = resize_image(prediction_textline, img_h, img_w) @@ -2036,7 +2048,7 @@ class Eynollah: #cv2.imwrite('prediction_textline2.png', 
prediction_textline[:,:,0]) - prediction_textline_longshot = self.do_prediction(False, img, self.model_textline) + prediction_textline_longshot = self.do_prediction(False, img, self.models["textline"]) prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) @@ -2069,7 +2081,7 @@ class Eynollah: img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new) img_resized = resize_image(img,img_h_new, img_w_new ) - prediction_regions_org, _ = self.do_prediction_new_concept(True, img_resized, self.model_region) + prediction_regions_org, _ = self.do_prediction_new_concept(True, img_resized, self.models["region"]) prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h ) image_page, page_coord, cont_page = self.extract_page() @@ -2185,7 +2197,7 @@ class Eynollah: #if self.input_binary: #img_bin = np.copy(img_resized) ###if (not self.input_binary and self.full_layout) or (not self.input_binary and num_col_classifier >= 30): - ###prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5) + ###prediction_bin = self.do_prediction(True, img_resized, self.models["binarization"], n_batch_inference=5) ####print("inside bin ", time.time()-t_bin) ###prediction_bin=prediction_bin[:,:,0] @@ -2200,7 +2212,7 @@ class Eynollah: ###else: ###img_bin = np.copy(img_resized) if (self.ocr and self.tr) and not self.input_binary: - prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5) + prediction_bin = self.do_prediction(True, img_resized, self.models["binarization"], n_batch_inference=5) prediction_bin = 255 * (prediction_bin[:,:,0] == 0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) prediction_bin = prediction_bin.astype(np.uint16) @@ -2232,14 +2244,14 @@ class Eynollah: self.logger.debug("resized to %dx%d for %d cols", img_resized.shape[1], img_resized.shape[0], num_col_classifier) prediction_regions_org, confidence_matrix 
= self.do_prediction_new_concept( - True, img_resized, self.model_region_1_2, n_batch_inference=1, + True, img_resized, self.models["region_1_2"], n_batch_inference=1, thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) else: prediction_regions_org = np.zeros((self.image_org.shape[0], self.image_org.shape[1], 3)) confidence_matrix = np.zeros((self.image_org.shape[0], self.image_org.shape[1])) prediction_regions_page, confidence_matrix_page = self.do_prediction_new_concept( - False, self.image_page_org_size, self.model_region_1_2, n_batch_inference=1, + False, self.image_page_org_size, self.models["region_1_2"], n_batch_inference=1, thresholding_for_artificial_class_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) ys = slice(*self.page_coord[0:2]) @@ -2253,10 +2265,10 @@ class Eynollah: self.logger.debug("resized to %dx%d (new_h=%d) for %d cols", img_resized.shape[1], img_resized.shape[0], new_h, num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( - True, img_resized, self.model_region_1_2, n_batch_inference=2, + True, img_resized, self.models["region_1_2"], n_batch_inference=2, thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) - ###prediction_regions_org = self.do_prediction(True, img_bin, self.model_region, + ###prediction_regions_org = self.do_prediction(True, img_bin, self.models["region"], ###n_batch_inference=3, ###thresholding_for_some_classes_in_light_version=True) #print("inside 3 ", time.time()-t_in) @@ -2336,7 +2348,7 @@ class Eynollah: ratio_x=1 img = resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) - prediction_regions_org_y = self.do_prediction(True, img, self.model_region) + prediction_regions_org_y = self.do_prediction(True, img, self.models["region"]) prediction_regions_org_y = resize_image(prediction_regions_org_y, 
img_height_h, img_width_h ) #plt.imshow(prediction_regions_org_y[:,:,0]) @@ -2351,7 +2363,7 @@ class Eynollah: _, _ = find_num_col(img_only_regions, num_col_classifier, self.tables, multiplier=6.0) img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]*(1.2 if is_image_enhanced else 1))) - prediction_regions_org = self.do_prediction(True, img, self.model_region) + prediction_regions_org = self.do_prediction(True, img, self.models["region"]) prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] @@ -2359,7 +2371,7 @@ class Eynollah: img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1])) - prediction_regions_org2 = self.do_prediction(True, img, self.model_region_p2, marginal_of_patch_percent=0.2) + prediction_regions_org2 = self.do_prediction(True, img, self.models["region_p2"], marginal_of_patch_percent=0.2) prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) mask_zeros2 = (prediction_regions_org2[:,:,0] == 0) @@ -2383,7 +2395,7 @@ class Eynollah: if self.input_binary: prediction_bin = np.copy(img_org) else: - prediction_bin = self.do_prediction(True, img_org, self.model_bin, n_batch_inference=5) + prediction_bin = self.do_prediction(True, img_org, self.models["binarization"], n_batch_inference=5) prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h ) prediction_bin = 255 * (prediction_bin[:,:,0]==0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) @@ -2393,7 +2405,7 @@ class Eynollah: img = resize_image(prediction_bin, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) - prediction_regions_org = self.do_prediction(True, img, self.model_region) + prediction_regions_org = self.do_prediction(True, img, self.models["region"]) prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) 
prediction_regions_org=prediction_regions_org[:,:,0] @@ -2420,7 +2432,7 @@ class Eynollah: except: if self.input_binary: prediction_bin = np.copy(img_org) - prediction_bin = self.do_prediction(True, img_org, self.model_bin, n_batch_inference=5) + prediction_bin = self.do_prediction(True, img_org, self.models["binarization"], n_batch_inference=5) prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h ) prediction_bin = 255 * (prediction_bin[:,:,0]==0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) @@ -2431,14 +2443,14 @@ class Eynollah: img = resize_image(prediction_bin, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) - prediction_regions_org = self.do_prediction(True, img, self.model_region) + prediction_regions_org = self.do_prediction(True, img, self.models["region"]) prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] #mask_lines_only=(prediction_regions_org[:,:]==3)*1 #img = resize_image(img_org, int(img_org.shape[0]*1), int(img_org.shape[1]*1)) - #prediction_regions_org = self.do_prediction(True, img, self.model_region) + #prediction_regions_org = self.do_prediction(True, img, self.models["region"]) #prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) #prediction_regions_org = prediction_regions_org[:,:,0] #prediction_regions_org[(prediction_regions_org[:,:] == 1) & (mask_zeros_y[:,:] == 1)]=0 @@ -2809,13 +2821,13 @@ class Eynollah: img_width_h = img_org.shape[1] patches = False if self.light_version: - prediction_table, _ = self.do_prediction_new_concept(patches, img, self.model_table) + prediction_table, _ = self.do_prediction_new_concept(patches, img, self.models["table"]) prediction_table = prediction_table.astype(np.int16) return prediction_table[:,:,0] else: if num_col_classifier < 4 and num_col_classifier > 2: - prediction_table = self.do_prediction(patches, img, 
self.model_table) - pre_updown = self.do_prediction(patches, cv2.flip(img[:,:,:], -1), self.model_table) + prediction_table = self.do_prediction(patches, img, self.models["table"]) + pre_updown = self.do_prediction(patches, cv2.flip(img[:,:,:], -1), self.models["table"]) pre_updown = cv2.flip(pre_updown, -1) prediction_table[:,:,0][pre_updown[:,:,0]==1]=1 @@ -2834,8 +2846,8 @@ class Eynollah: xs = slice(w_start, w_start + img.shape[1]) img_new[ys, xs] = img - prediction_ext = self.do_prediction(patches, img_new, self.model_table) - pre_updown = self.do_prediction(patches, cv2.flip(img_new[:,:,:], -1), self.model_table) + prediction_ext = self.do_prediction(patches, img_new, self.models["table"]) + pre_updown = self.do_prediction(patches, cv2.flip(img_new[:,:,:], -1), self.models["table"]) pre_updown = cv2.flip(pre_updown, -1) prediction_table = prediction_ext[ys, xs] @@ -2856,8 +2868,8 @@ class Eynollah: xs = slice(w_start, w_start + img.shape[1]) img_new[ys, xs] = img - prediction_ext = self.do_prediction(patches, img_new, self.model_table) - pre_updown = self.do_prediction(patches, cv2.flip(img_new[:,:,:], -1), self.model_table) + prediction_ext = self.do_prediction(patches, img_new, self.models["table"]) + pre_updown = self.do_prediction(patches, cv2.flip(img_new[:,:,:], -1), self.models["table"]) pre_updown = cv2.flip(pre_updown, -1) prediction_table = prediction_ext[ys, xs] @@ -2869,10 +2881,10 @@ class Eynollah: prediction_table = np.zeros(img.shape) img_w_half = img.shape[1] // 2 - pre1 = self.do_prediction(patches, img[:,0:img_w_half,:], self.model_table) - pre2 = self.do_prediction(patches, img[:,img_w_half:,:], self.model_table) - pre_full = self.do_prediction(patches, img[:,:,:], self.model_table) - pre_updown = self.do_prediction(patches, cv2.flip(img[:,:,:], -1), self.model_table) + pre1 = self.do_prediction(patches, img[:,0:img_w_half,:], self.models["table"]) + pre2 = self.do_prediction(patches, img[:,img_w_half:,:], self.models["table"]) + pre_full 
= self.do_prediction(patches, img[:,:,:], self.models["table"]) + pre_updown = self.do_prediction(patches, cv2.flip(img[:,:,:], -1), self.models["table"]) pre_updown = cv2.flip(pre_updown, -1) prediction_table_full_erode = cv2.erode(pre_full[:,:,0], KERNEL, iterations=4) @@ -3474,18 +3486,6 @@ class Eynollah: regions_without_separators_d, regions_fully, regions_without_separators, polygons_of_marginals, contours_tables) - @staticmethod - def our_load_model(model_file): - if model_file.endswith('.h5') and Path(model_file[:-3]).exists(): - # prefer SavedModel over HDF5 format if it exists - model_file = model_file[:-3] - try: - model = load_model(model_file, compile=False) - except: - model = load_model(model_file, compile=False, custom_objects={ - "PatchEncoder": PatchEncoder, "Patches": Patches}) - return model - def do_order_of_regions_with_model(self, contours_only_text_parent, contours_only_text_parent_h, text_regions_p): height1 =672#448 @@ -3676,7 +3676,7 @@ class Eynollah: tot_counter += 1 batch.append(j) if tot_counter % inference_bs == 0 or tot_counter == len(ij_list): - y_pr = self.model_reading_order.predict(input_1 , verbose=0) + y_pr = self.models["reading_order"].predict(input_1 , verbose=0) for jb, j in enumerate(batch): if y_pr[jb][0]>=0.5: post_list.append(j) @@ -4259,7 +4259,7 @@ class Eynollah: gc.collect() ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons, np.zeros((len(all_found_textline_polygons), 4)), - self.prediction_model, self.b_s_ocr, self.num_to_char, textline_light=True) + self.models["ocr"], self.b_s_ocr, self.num_to_char, textline_light=True) else: ocr_all_textlines = None @@ -4768,27 +4768,27 @@ class Eynollah: if len(all_found_textline_polygons): ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons, all_box_coord, - self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + self.models["ocr"], 
self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) if len(all_found_textline_polygons_marginals_left): ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_marginals_left, all_box_coord_marginals_left, - self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + self.models["ocr"], self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) if len(all_found_textline_polygons_marginals_right): ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_marginals_right, all_box_coord_marginals_right, - self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + self.models["ocr"], self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) if self.full_layout and len(all_found_textline_polygons): ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_h, all_box_coord_h, - self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + self.models["ocr"], self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) if self.full_layout and len(polygons_of_drop_capitals): ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines( image_page, polygons_of_drop_capitals, np.zeros((len(polygons_of_drop_capitals), 4)), - self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + self.models["ocr"], self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: if self.light_version: @@ -4800,7 +4800,7 @@ class Eynollah: gc.collect() torch.cuda.empty_cache() - self.model_ocr.to(self.device) + self.models["ocr"].to(self.device) ind_tot = 0 #cv2.imwrite('./img_out.png', image_page) @@ -4837,7 +4837,7 @@ class Eynollah: img_croped = img_poly_on_img[y:y+h, x:x+w, :] 
#cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) text_ocr = self.return_ocr_of_textline_without_common_section( - img_croped, self.model_ocr, self.processor, self.device, w, h2w_ratio, ind_tot) + img_croped, self.models["ocr"], self.processor, self.device, w, h2w_ratio, ind_tot) ocr_textline_in_textregion.append(text_ocr) ind_tot = ind_tot +1 ocr_all_textlines.append(ocr_textline_in_textregion) From 2056a8bdb9aff8895235f36f2ddf11a42b0469a3 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 10 Oct 2025 16:32:47 +0200 Subject: [PATCH 093/101] :package: v0.6.0rc1 --- CHANGELOG.md | 3 +++ src/eynollah/ocrd-tool.json | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df1e12e..d0ad43c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.6.0rc1] - 2025-10-10 + Fixed: * continue processing when no columns detected but text regions exist @@ -289,6 +291,7 @@ Fixed: Initial release +[0.6.0rc1]: ../../compare/v0.6.0rc1...v0.5.0 [0.5.0]: ../../compare/v0.5.0...v0.4.0 [0.4.0]: ../../compare/v0.4.0...v0.3.1 [0.3.1]: ../../compare/v0.3.1...v0.3.0 diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index 5d89c92..2ae4ead 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.5.0", + "version": "0.6.0rc1", "git_url": "https://github.com/qurator-spk/eynollah", "dockerhub": "ocrd/eynollah", "tools": { From 745cf3be48ad6d5fee9c6297e50ea2d52d7f8fd2 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 10 Oct 2025 16:39:16 +0200 Subject: [PATCH 094/101] XML encoding should be utf-8 not utf8 ... 
and should use OCR-D's generateDS PAGE API consistently --- src/eynollah/eynollah.py | 4 ++-- src/eynollah/mb_ro_on_layout.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0992c8c..94bd10c 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5284,7 +5284,7 @@ class Eynollah_ocr: ##unicode_textpage.text = tot_page_text ET.register_namespace("",name_space) - tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) + tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf-8",default_namespace=None) else: ###max_len = 280#512#280#512 ###padding_token = 1500#299#1500#299 @@ -5833,5 +5833,5 @@ class Eynollah_ocr: ##unicode_textpage.text = tot_page_text ET.register_namespace("",name_space) - tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) + tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf-8",default_namespace=None) #print("Job done in %.1fs", time.time() - t0) diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index 218f973..1b991ae 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -805,7 +805,7 @@ class machine_based_reading_order_on_layout: tree_xml.write(os.path.join(dir_out, file_name+'.xml'), xml_declaration=True, method='xml', - encoding="utf8", + encoding="utf-8", default_namespace=None) #sys.exit() From e8b7212f36af40c536bdf3607d53d6c60460b129 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 14 Oct 2025 14:16:39 +0200 Subject: [PATCH 095/101] `polygon2contour`: avoid uint for coords (introduced in a433c736 to make consistent with `filter_contours_area_of_image`, but actually np.uint is prone to create overflows downstream) --- src/eynollah/utils/contour.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/utils/contour.py 
b/src/eynollah/utils/contour.py index f998c4d..21068b3 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -276,7 +276,7 @@ def contour2polygon(contour: Union[np.ndarray, Sequence[Sequence[Sequence[Number def polygon2contour(polygon: Polygon) -> np.ndarray: polygon = np.array(polygon.exterior.coords[:-1], dtype=int) - return np.maximum(0, polygon).astype(np.uint)[:, np.newaxis] + return np.maximum(0, polygon).astype(int)[:, np.newaxis] def make_intersection(poly1, poly2): interp = poly1.intersection(poly2) From 8299e7009a569c0c3c82e603df245c730f4f52b4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 14 Oct 2025 14:23:29 +0200 Subject: [PATCH 096/101] `setup_models`: avoid unnecessarily loading `region_fl` --- src/eynollah/eynollah.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0992c8c..6367c91 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -385,6 +385,8 @@ class Eynollah: self.logger.warning("overriding default model %s version %s to %s", key, self.model_versions[key], val) self.model_versions[key] = val # load models, depending on modes + # (note: loading too many models can cause OOM on GPU/CUDA, + # thus, we try set up the minimal configuration for the current mode) loadable = [ "col_classifier", "binarization", @@ -400,8 +402,8 @@ class Eynollah: # if self.allow_enhancement:? 
loadable.append("enhancement") if self.full_layout: - loadable.extend(["region_fl_np", - "region_fl"]) + loadable.append("region_fl_np") + #loadable.append("region_fl") if self.reading_order_machine_based: loadable.append("reading_order") if self.tables: From 2febf534797eaa5be35caf16d7965c3ac39bdd39 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 14 Oct 2025 14:52:31 +0200 Subject: [PATCH 097/101] :memo: changelog --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0ad43c..dfd6868 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + + * Prevent OOM GPU error by avoiding loading the `region_fl` model, #199 + ## [0.6.0rc1] - 2025-10-10 Fixed: @@ -21,8 +25,7 @@ Fixed: * Dockerfile: fix CUDA installation (cuDNN contested between Torch and TF due to extra OCR) * OCR: re-instate missing methods and fix `utils_ocr` function calls * mbreorder/enhancement CLIs: missing imports - * :fire: writer: `SeparatorRegion` needs `SeparatorRegionType` (not `ImageRegionType`) -f458e3e + * :fire: writer: `SeparatorRegion` needs `SeparatorRegionType` (not `ImageRegionType`), f458e3e * tests: switch from `pytest-subtests` to `parametrize` so we can use `pytest-isolate` (so CUDA memory gets freed between tests if running on GPU) From c1f01588062714ba0c5146dc676c2dacade3e36f Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 14 Oct 2025 14:53:15 +0200 Subject: [PATCH 098/101] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dfd6868..636880f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
Fixed: * Prevent OOM GPU error by avoiding loading the `region_fl` model, #199 + * XML output: encoding should be `utf-8`, not `utf8`, #196, #197 ## [0.6.0rc1] - 2025-10-10 From f485dd41819018a39960e45d5fd61c68d835cf1a Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 14 Oct 2025 16:10:50 +0200 Subject: [PATCH 099/101] :package: v0.6.0rc2 --- CHANGELOG.md | 3 +++ src/eynollah/ocrd-tool.json | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 636880f..f84c153 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.6.0rc2] - 2025-10-14 + Fixed: * Prevent OOM GPU error by avoiding loading the `region_fl` model, #199 @@ -295,6 +297,7 @@ Fixed: Initial release +[0.6.0rc2]: ../../compare/v0.6.0rc2...v0.6.0rc1 [0.6.0rc1]: ../../compare/v0.6.0rc1...v0.5.0 [0.5.0]: ../../compare/v0.5.0...v0.4.0 [0.4.0]: ../../compare/v0.4.0...v0.3.1 diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index 2ae4ead..f9c6f4d 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.6.0rc1", + "version": "0.6.0rc2", "git_url": "https://github.com/qurator-spk/eynollah", "dockerhub": "ocrd/eynollah", "tools": { From 948c8c3441f6dfa1f371e01a73f79ba957acd5c7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 15 Oct 2025 16:58:17 +0200 Subject: [PATCH 100/101] join_polygons: try to catch rare case of MultiPolygon --- src/eynollah/utils/contour.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 21068b3..f71bdc4 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -353,6 +353,8 @@ def join_polygons(polygons: Sequence[Polygon], scale=20) -> Polygon: bridgep = orient(LineString(nearest).buffer(max(1, scale/5), resolution=1), -1) polygons.append(bridgep) jointp = unary_union(polygons) + if 
jointp.geom_type == 'MultiPolygon': + jointp = unary_union(jointp.geoms) assert jointp.geom_type == 'Polygon', jointp.wkt # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity From d2f0a43088e31a8948b903b5b1de10cd695ce3ae Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 16 Oct 2025 20:46:03 +0200 Subject: [PATCH 101/101] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f84c153..249affa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + + * `join_polygons` always returning Polygon, not MultiPolygon, #203 + ## [0.6.0rc2] - 2025-10-14 Fixed: