From 0b9d4901a61ea777fc0db6e90930a734fe33302d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 20:51:03 +0200 Subject: [PATCH 01/44] contour features: avoid unused calculations, simplify, add shortcuts - new function: `find_center_of_contours` - simplified: `find_(new_)features_of_contours` --- src/eynollah/utils/contour.py | 78 ++++++++++++----------------------- 1 file changed, 27 insertions(+), 51 deletions(-) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 0700ed4..041cbf6 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -79,61 +79,37 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area=1. found_polygons_early.append(polygon2contour(polygon)) return found_polygons_early -def find_new_features_of_contours(contours_main): - areas_main = np.array([cv2.contourArea(contours_main[j]) - for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) - for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) - for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) - for j in range(len(M_main))] - try: - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) - for j in range(len(contours_main))]) - argmin_x_main = np.array([np.argmin(contours_main[j][:, 0, 0]) - for j in range(len(contours_main))]) - x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 0, 0] - for j in range(len(contours_main))]) - y_corr_x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 0, 1] - for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) - for j in range(len(contours_main))]) - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) - for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) - for j in range(len(contours_main))]) - except: - x_min_main = 
np.array([np.min(contours_main[j][:, 0]) - for j in range(len(contours_main))]) - argmin_x_main = np.array([np.argmin(contours_main[j][:, 0]) - for j in range(len(contours_main))]) - x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 0] - for j in range(len(contours_main))]) - y_corr_x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 1] - for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0]) - for j in range(len(contours_main))]) - y_min_main = np.array([np.min(contours_main[j][:, 1]) - for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 1]) - for j in range(len(contours_main))]) - # dis_x=np.abs(x_max_main-x_min_main) +def find_center_of_contours(contours): + moments = [cv2.moments(contour) for contour in contours] + cx = [feat["m10"] / (feat["m00"] + 1e-32) + for feat in moments] + cy = [feat["m01"] / (feat["m00"] + 1e-32) + for feat in moments] + return cx, cy - return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin +def find_new_features_of_contours(contours): + # areas = np.array([cv2.contourArea(contour) for contour in contours]) + cx, cy = find_center_of_contours(contours) + slice_x = np.index_exp[:, 0, 0] + slice_y = np.index_exp[:, 0, 1] + if any(contour.ndim < 3 for contour in contours): + slice_x = np.index_exp[:, 0] + slice_y = np.index_exp[:, 1] + x_min = np.array([np.min(contour[slice_x]) for contour in contours]) + x_max = np.array([np.max(contour[slice_x]) for contour in contours]) + y_min = np.array([np.min(contour[slice_y]) for contour in contours]) + y_max = np.array([np.max(contour[slice_y]) for contour in contours]) + # dis_x=np.abs(x_max-x_min) + y_corr_x_min = np.array([contour[np.argmin(contour[slice_x])][slice_y[1:]] + for contour in contours]) -def find_features_of_contours(contours_main): - areas_main=np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - 
M_main=[cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main=[(M_main[j]['m10']/(M_main[j]['m00']+1e-32)) for j in range(len(M_main))] - cy_main=[(M_main[j]['m01']/(M_main[j]['m00']+1e-32)) for j in range(len(M_main))] - x_min_main=np.array([np.min(contours_main[j][:,0,0]) for j in range(len(contours_main))]) - x_max_main=np.array([np.max(contours_main[j][:,0,0]) for j in range(len(contours_main))]) + return cx, cy, x_min, x_max, y_min, y_max, y_corr_x_min - y_min_main=np.array([np.min(contours_main[j][:,0,1]) for j in range(len(contours_main))]) - y_max_main=np.array([np.max(contours_main[j][:,0,1]) for j in range(len(contours_main))]) +def find_features_of_contours(contours): + y_min = np.array([np.min(contour[:,0,1]) for contour in contours]) + y_max = np.array([np.max(contour[:,0,1]) for contour in contours]) - return y_min_main, y_max_main + return y_min, y_max def return_parent_contours(contours, hierarchy): contours_parent = [contours[i] From 81827c2942e0a6b7e4121b9de510108de4f026fa Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:03:07 +0200 Subject: [PATCH 02/44] filter_contours_inside_a_bigger_one: simplify - use new `find_center_of_contours` - avoid loops in favour of array processing - use sets instead of `np.unique` and `np.delete` instead of list.pop --- src/eynollah/eynollah.py | 102 +++++++++++++++------------------------ 1 file changed, 39 insertions(+), 63 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 62ce002..b2d9016 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4208,7 +4208,7 @@ class Eynollah: return generated_text def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes): - return [ls_cons[sorted_indexes[index]] for index in range(len(sorted_indexes))] + return list(np.array(ls_cons)[np.array(sorted_indexes)]) def return_it_in_two_groups(self, x_differential): split = [ind if 
x_differential[ind]!=x_differential[ind+1] else -1 @@ -4237,47 +4237,38 @@ class Eynollah: def filter_contours_inside_a_bigger_one(self, contours, contours_d_ordered, image, marginal_cnts=None, type_contour="textregion"): - if type_contour=="textregion": - areas = [cv2.contourArea(contours[j]) for j in range(len(contours))] + if type_contour == "textregion": + areas = np.array(list(map(cv2.contourArea, contours))) area_tot = image.shape[0]*image.shape[1] + areas_ratio = areas / area_tot + cx_main, cy_main = find_center_of_contours(contours) - M_main = [cv2.moments(contours[j]) - for j in range(len(contours))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + contours_index_small = np.flatnonzero(areas_ratio < 1e-3) + contours_index_large = np.flatnonzero(areas_ratio >= 1e-3) - areas_ratio = np.array(areas)/ area_tot - contours_index_small = [ind for ind in range(len(contours)) if areas_ratio[ind] < 1e-3] - contours_index_big = [ind for ind in range(len(contours)) if areas_ratio[ind] >= 1e-3] - - #contours_> = [contours[ind] for ind in contours_index_big] + #contours_> = [contours[ind] for ind in contours_index_large] indexes_to_be_removed = [] for ind_small in contours_index_small: - results = [cv2.pointPolygonTest(contours[ind], (cx_main[ind_small], cy_main[ind_small]), False) - for ind in contours_index_big] - if marginal_cnts: - results_marginal = [cv2.pointPolygonTest(marginal_cnts[ind], + results = [cv2.pointPolygonTest(contours[ind_large], (cx_main[ind_small], + cy_main[ind_small]), + False) + for ind_large in contours_index_large] + results = np.array(results) + if np.any(results==1): + indexes_to_be_removed.append(ind_small) + elif marginal_cnts: + results_marginal = [cv2.pointPolygonTest(marginal_cnt, (cx_main[ind_small], cy_main[ind_small]), False) - for ind in range(len(marginal_cnts))] + for marginal_cnt in marginal_cnts] 
results_marginal = np.array(results_marginal) - if np.any(results_marginal==1): indexes_to_be_removed.append(ind_small) - results = np.array(results) - - if np.any(results==1): - indexes_to_be_removed.append(ind_small) - - if len(indexes_to_be_removed)>0: - indexes_to_be_removed = np.unique(indexes_to_be_removed) - indexes_to_be_removed = np.sort(indexes_to_be_removed)[::-1] - for ind in indexes_to_be_removed: - contours.pop(ind) - if len(contours_d_ordered)>0: - contours_d_ordered.pop(ind) + contours = np.delete(contours, indexes_to_be_removed, axis=0) + if len(contours_d_ordered): + contours_d_ordered = np.delete(contours_d_ordered, indexes_to_be_removed, axis=0) return contours, contours_d_ordered @@ -4285,33 +4276,21 @@ class Eynollah: contours_txtline_of_all_textregions = [] indexes_of_textline_tot = [] index_textline_inside_textregion = [] + for ind_region, textlines in enumerate(contours): + contours_txtline_of_all_textregions.extend(textlines) + index_textline_inside_textregion.extend(list(range(len(textlines)))) + indexes_of_textline_tot.extend([ind_region] * len(textlines)) - for jj in range(len(contours)): - contours_txtline_of_all_textregions = contours_txtline_of_all_textregions + contours[jj] - - ind_textline_inside_tr = list(range(len(contours[jj]))) - index_textline_inside_textregion = index_textline_inside_textregion + ind_textline_inside_tr - ind_ins = [jj] * len(contours[jj]) - indexes_of_textline_tot = indexes_of_textline_tot + ind_ins - - M_main_tot = [cv2.moments(contours_txtline_of_all_textregions[j]) - for j in range(len(contours_txtline_of_all_textregions))] - cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - - areas_tot = [cv2.contourArea(con_ind) for con_ind in contours_txtline_of_all_textregions] + areas_tot = np.array(list(map(cv2.contourArea, 
contours_txtline_of_all_textregions))) area_tot_tot = image.shape[0]*image.shape[1] + cx_main_tot, cy_main_tot = find_center_of_contours(contours_txtline_of_all_textregions) - textregion_index_to_del = [] - textline_in_textregion_index_to_del = [] + textline_in_textregion_index_to_del = {} for ij in range(len(contours_txtline_of_all_textregions)): - args_all = list(np.array(range(len(contours_txtline_of_all_textregions)))) - args_all.pop(ij) - - areas_without = np.array(areas_tot)[args_all] area_of_con_interest = areas_tot[ij] - - args_with_bigger_area = np.array(args_all)[areas_without > 1.5*area_of_con_interest] + args_without = np.delete(np.arange(len(contours_txtline_of_all_textregions)), ij) + areas_without = areas_tot[args_without] + args_with_bigger_area = args_without[areas_without > 1.5*area_of_con_interest] if len(args_with_bigger_area)>0: results = [cv2.pointPolygonTest(contours_txtline_of_all_textregions[ind], @@ -4322,18 +4301,15 @@ class Eynollah: results = np.array(results) if np.any(results==1): #print(indexes_of_textline_tot[ij], index_textline_inside_textregion[ij]) - textregion_index_to_del.append(int(indexes_of_textline_tot[ij])) - textline_in_textregion_index_to_del.append(int(index_textline_inside_textregion[ij])) - #contours[int(indexes_of_textline_tot[ij])].pop(int(index_textline_inside_textregion[ij])) + textline_in_textregion_index_to_del.setdefault( + indexes_of_textline_tot[ij], list()).append( + index_textline_inside_textregion[ij]) + #contours[indexes_of_textline_tot[ij]].pop(index_textline_inside_textregion[ij]) - textregion_index_to_del = np.array(textregion_index_to_del) - textline_in_textregion_index_to_del = np.array(textline_in_textregion_index_to_del) - for ind_u_a_trs in np.unique(textregion_index_to_del): - textline_in_textregion_index_to_del_ind = \ - textline_in_textregion_index_to_del[textregion_index_to_del==ind_u_a_trs] - textline_in_textregion_index_to_del_ind = np.sort(textline_in_textregion_index_to_del_ind)[::-1] - 
for ittrd in textline_in_textregion_index_to_del_ind: - contours[ind_u_a_trs].pop(ittrd) + for textregion_index_to_del in textline_in_textregion_index_to_del: + contours[textregion_index_to_del] = list(np.delete( + contours[textregion_index_to_del], + textline_in_textregion_index_to_del[textregion_index_to_del])) return contours From 8c3d5eb0eb0eccd97542a86b0d3385e95f4f1da0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:07:35 +0200 Subject: [PATCH 03/44] separate_marginals_to_left_and_right_and_order_from_top_to_down: simplify - use new `find_center_of_contours` - avoid loops in favour of array processing - avoid repeated sorting --- src/eynollah/eynollah.py | 75 +++++++++++++++++----------------- src/eynollah/utils/__init__.py | 2 +- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b2d9016..9eba3d3 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4418,52 +4418,53 @@ class Eynollah: def separate_marginals_to_left_and_right_and_order_from_top_to_down( self, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width): - cx_marg, cy_marg, _, _, _, _, _ = find_new_features_of_contours( - polygons_of_marginals) - + cx_marg, cy_marg = find_center_of_contours(polygons_of_marginals) cx_marg = np.array(cx_marg) cy_marg = np.array(cy_marg) + + def split(lis): + array = np.array(lis) + return (list(array[cx_marg < mid_point_of_page_width]), + list(array[cx_marg >= mid_point_of_page_width])) + + (poly_marg_left, + poly_marg_right) = \ + split(polygons_of_marginals) + + (all_found_textline_polygons_marginals_left, + all_found_textline_polygons_marginals_right) = \ + split(all_found_textline_polygons_marginals) - poly_marg_left = list( np.array(polygons_of_marginals)[cx_marg < mid_point_of_page_width] ) - poly_marg_right = list( np.array(polygons_of_marginals)[cx_marg >= 
mid_point_of_page_width] ) + (all_box_coord_marginals_left, + all_box_coord_marginals_right) = \ + split(all_box_coord_marginals) - all_found_textline_polygons_marginals_left = \ - list( np.array(all_found_textline_polygons_marginals)[cx_marg < mid_point_of_page_width] ) - all_found_textline_polygons_marginals_right = \ - list( np.array(all_found_textline_polygons_marginals)[cx_marg >= mid_point_of_page_width] ) + (slopes_marg_left, + slopes_marg_right) = \ + split(slopes_marginals) - all_box_coord_marginals_left = list( np.array(all_box_coord_marginals)[cx_marg < mid_point_of_page_width] ) - all_box_coord_marginals_right = list( np.array(all_box_coord_marginals)[cx_marg >= mid_point_of_page_width] ) + (cy_marg_left, + cy_marg_right) = \ + split(cy_marg) + + order_left = np.argsort(cy_marg_left) + order_right = np.argsort(cy_marg_right) + def sort_left(lis): + return list(np.array(lis)[order_left]) + def sort_right(lis): + return list(np.array(lis)[order_right]) - slopes_marg_left = list( np.array(slopes_marginals)[cx_marg < mid_point_of_page_width] ) - slopes_marg_right = list( np.array(slopes_marginals)[cx_marg >= mid_point_of_page_width] ) + ordered_left_marginals = sort_left(poly_marg_left) + ordered_right_marginals = sort_right(poly_marg_right) - cy_marg_left = cy_marg[cx_marg < mid_point_of_page_width] - cy_marg_right = cy_marg[cx_marg >= mid_point_of_page_width] + ordered_left_marginals_textline = sort_left(all_found_textline_polygons_marginals_left) + ordered_right_marginals_textline = sort_right(all_found_textline_polygons_marginals_right) - ordered_left_marginals = [poly for _, poly in sorted(zip(cy_marg_left, poly_marg_left), - key=lambda x: x[0])] - ordered_right_marginals = [poly for _, poly in sorted(zip(cy_marg_right, poly_marg_right), - key=lambda x: x[0])] + ordered_left_marginals_bbox = sort_left(all_box_coord_marginals_left) + ordered_right_marginals_bbox = sort_right(all_box_coord_marginals_right) - ordered_left_marginals_textline = [poly for _, 
poly in sorted(zip(cy_marg_left, - all_found_textline_polygons_marginals_left), - key=lambda x: x[0])] - ordered_right_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_right, - all_found_textline_polygons_marginals_right), - key=lambda x: x[0])] - - ordered_left_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_left, - all_box_coord_marginals_left), - key=lambda x: x[0])] - ordered_right_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_right, - all_box_coord_marginals_right), - key=lambda x: x[0])] - - ordered_left_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_left, slopes_marg_left), - key=lambda x: x[0])] - ordered_right_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_right, slopes_marg_right), - key=lambda x: x[0])] + ordered_left_slopes_marginals = sort_left(slopes_marg_left) + ordered_right_slopes_marginals = sort_right(slopes_marg_right) return (ordered_left_marginals, ordered_right_marginals, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 52bf3ef..4eee5a9 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1417,7 +1417,7 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( imgray = cv2.cvtColor(sep_ver_hor_cross, cv2.COLOR_BGR2GRAY) ret, thresh = cv2.threshold(imgray, 0, 255, 0) contours_cross,_=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - cx_cross,cy_cross ,_ , _, _ ,_,_=find_new_features_of_contours(contours_cross) + cx_cross, cy_cross = find_center_of_contours(contours_cross) for ii in range(len(cx_cross)): img_p_in[int(cy_cross[ii])-30:int(cy_cross[ii])+30,int(cx_cross[ii])+5:int(cx_cross[ii])+40,0]=0 img_p_in[int(cy_cross[ii])-30:int(cy_cross[ii])+30,int(cx_cross[ii])-40:int(cx_cross[ii])-4,0]=0 From 3f3353ec3a53384a100ef9ebe2fefb7e092e968c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:28:04 +0200 Subject: [PATCH 04/44] do_order_of_regions: simplify - avoid 
loops in favour of array processing --- src/eynollah/eynollah.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9eba3d3..7f7f53f 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2518,6 +2518,8 @@ class Eynollah: self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): self.logger.debug("enter do_order_of_regions_full_layout") + contours_only_text_parent = np.array(contours_only_text_parent) + contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( contours_only_text_parent) @@ -2573,14 +2575,9 @@ class Eynollah: xs = slice(*boxes[iij][0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = [] - con_inter_box_h = [] + con_inter_box = contours_only_text_parent[args_contours_box] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] - for box in args_contours_box: - con_inter_box.append(contours_only_text_parent[box]) - - for box in args_contours_box_h: - con_inter_box_h.append(contours_only_text_parent_h[box]) indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) @@ -2675,14 +2672,8 @@ class Eynollah: xs = slice(*boxes[iij][0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = [] - con_inter_box_h = [] - - for box in args_contours_box: - con_inter_box.append(contours_only_text_parent[box]) - - for box in args_contours_box_h: - con_inter_box_h.append(contours_only_text_parent_h[box]) + con_inter_box = 
contours_only_text_parent[args_contours_box] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) @@ -2729,6 +2720,8 @@ class Eynollah: self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): self.logger.debug("enter do_order_of_regions_no_full_layout") + contours_only_text_parent = np.array(contours_only_text_parent) + contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( contours_only_text_parent) @@ -2761,10 +2754,8 @@ class Eynollah: ys = slice(*boxes[iij][2:4]) xs = slice(*boxes[iij][0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = [] + con_inter_box = contours_only_text_parent[args_contours_box] con_inter_box_h = [] - for i in range(len(args_contours_box)): - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) From 415b2cbad843d4fa083f94f459777af97bd81234 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:36:22 +0200 Subject: [PATCH 05/44] eynollah, drop_capitals: simplify - use new `find_center_of_contours` --- src/eynollah/eynollah.py | 21 ++++++++------------- src/eynollah/utils/drop_capitals.py | 27 ++++++++++++++------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 7f7f53f..357c0c2 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -70,6 +70,7 @@ from .utils.contour import ( filter_contours_area_of_image, 
filter_contours_area_of_image_tables, find_contours_mean_y_diff, + find_center_of_contours, find_new_features_of_contours, find_features_of_contours, get_text_region_boxes_by_given_contours, @@ -1859,14 +1860,10 @@ class Eynollah: def get_slopes_and_deskew_new_light2(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): polygons_of_textlines = return_contours_of_interested_region(textline_mask_tot,1,0.00001) - M_main_tot = [cv2.moments(polygons_of_textlines[j]) - for j in range(len(polygons_of_textlines))] + cx_main_tot, cy_main_tot = find_center_of_contours(polygons_of_textlines) + w_h_textlines = [cv2.boundingRect(polygon)[2:] for polygon in polygons_of_textlines] - w_h_textlines = [cv2.boundingRect(polygons_of_textlines[i])[2:] for i in range(len(polygons_of_textlines))] - cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - - args_textlines = np.array(range(len(polygons_of_textlines))) + args_textlines = np.arange(len(polygons_of_textlines)) all_found_textline_polygons = [] slopes = [] all_box_coord =[] @@ -4809,8 +4806,8 @@ class Eynollah: areas_cnt_text_parent = self.return_list_of_contours_with_desired_order( areas_cnt_text_parent, index_con_parents) - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) + cx_bigest_big, cy_biggest_big = find_center_of_contours([contours_biggest]) + cx_bigest, cy_biggest = find_center_of_contours(contours_only_text_parent) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) @@ -4834,10 +4831,8 @@ class Eynollah: areas_cnt_text_d = self.return_list_of_contours_with_desired_order( areas_cnt_text_d, index_con_parents_d) - cx_bigest_d_big, 
cy_biggest_d_big, _, _, _, _, _ = \ - find_new_features_of_contours([contours_biggest_d]) - cx_bigest_d, cy_biggest_d, _, _, _, _, _ = \ - find_new_features_of_contours(contours_only_text_parent_d) + cx_bigest_d_big, cy_biggest_d_big = find_center_of_contours([contours_biggest_d]) + cx_bigest_d, cy_biggest_d = find_center_of_contours(contours_only_text_parent_d) try: if len(cx_bigest_d) >= 5: cx_bigest_d_last5 = cx_bigest_d[-5:] diff --git a/src/eynollah/utils/drop_capitals.py b/src/eynollah/utils/drop_capitals.py index 67547d3..9f82fac 100644 --- a/src/eynollah/utils/drop_capitals.py +++ b/src/eynollah/utils/drop_capitals.py @@ -1,6 +1,7 @@ import numpy as np import cv2 from .contour import ( + find_center_of_contours, find_new_features_of_contours, return_contours_of_image, return_parent_contours, @@ -22,8 +23,8 @@ def adhere_drop_capital_region_into_corresponding_textline( ): # print(np.shape(all_found_textline_polygons),np.shape(all_found_textline_polygons[3]),'all_found_textline_polygonsshape') # print(all_found_textline_polygons[3]) - cx_m, cy_m, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) - cx_h, cy_h, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_h) + cx_m, cy_m = find_center_of_contours(contours_only_text_parent) + cx_h, cy_h = find_center_of_contours(contours_only_text_parent_h) cx_d, cy_d, _, _, y_min_d, y_max_d, _ = find_new_features_of_contours(polygons_of_drop_capitals) img_con_all = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) @@ -89,9 +90,9 @@ def adhere_drop_capital_region_into_corresponding_textline( region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1 # print(region_final,'region_final') - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + # cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = 
find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -153,9 +154,9 @@ def adhere_drop_capital_region_into_corresponding_textline( # areas_main=np.array([cv2.contourArea(all_found_textline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_textline_polygons[int(region_final)]))]) - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + # cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -208,7 +209,7 @@ def adhere_drop_capital_region_into_corresponding_textline( try: # print(all_found_textline_polygons[j_cont][0]) - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -261,7 +262,7 @@ def adhere_drop_capital_region_into_corresponding_textline( else: pass - ##cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + ##cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) ###print(all_box_coord[j_cont]) ###print(cx_t) ###print(cy_t) @@ -315,9 +316,9 @@ def adhere_drop_capital_region_into_corresponding_textline( region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1 # print(region_final,'region_final') - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + # cx_t, cy_t = 
find_center_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -375,12 +376,12 @@ def adhere_drop_capital_region_into_corresponding_textline( # areas_main=np.array([cv2.contourArea(all_found_textline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_textline_polygons[int(region_final)]))]) - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + # cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(cx_t,'print') try: # print(all_found_textline_polygons[j_cont][0]) - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) + cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -453,7 +454,7 @@ def adhere_drop_capital_region_into_corresponding_textline( #####try: #####if len(contours_new_parent)==1: ######print(all_found_textline_polygons[j_cont][0]) - #####cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[j_cont]) + #####cx_t, cy_t = find_center_of_contours(all_found_textline_polygons[j_cont]) ######print(all_box_coord[j_cont]) ######print(cx_t) ######print(cy_t) From a1c8fd44677fc894395652de070710a5fc6aae2e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 21:41:37 +0200 Subject: [PATCH 06/44] do_order_of_regions / order_of_regions: simplify - array-convert only once (before returning from `order_of_regions`) - avoid passing `matrix_of_orders` unnecessarily between `order_of_regions` and `order_and_id_of_texts` --- src/eynollah/eynollah.py | 73 +++++++++++++++++----------------- 
src/eynollah/utils/__init__.py | 2 +- src/eynollah/utils/xml.py | 6 +-- 3 files changed, 38 insertions(+), 43 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 357c0c2..8351ab6 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2567,26 +2567,25 @@ class Eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij in range(len(boxes)): - ys = slice(*boxes[iij][2:4]) - xs = slice(*boxes[iij][0:2]) + for iij, box in enumerate(boxes): + ys = slice(*box[2:4]) + xs = slice(*box[0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] con_inter_box = contours_only_text_parent[args_contours_box] con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] - - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, - matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] + indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] + indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] + indexes_by_type_head = 
index_by_kind_sorted[kind_of_texts_sorted == 2] for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] @@ -2664,25 +2663,25 @@ class Eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij, _ in enumerate(boxes): - ys = slice(*boxes[iij][2:4]) - xs = slice(*boxes[iij][0:2]) + for iij, box in enumerate(boxes): + ys = slice(*box[2:4]) + xs = slice(*box[0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] con_inter_box = contours_only_text_parent[args_contours_box] con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, - matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] + indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] + indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] + indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] @@ -2747,22 +2746,22 
@@ class Eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij in range(len(boxes)): - ys = slice(*boxes[iij][2:4]) - xs = slice(*boxes[iij][0:2]) + for iij, box in enumerate(boxes): + ys = slice(*box[2:4]) + xs = slice(*box[0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] con_inter_box = contours_only_text_parent[args_contours_box] con_inter_box_h = [] - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, - matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] + indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] @@ -2808,24 +2807,24 @@ class Eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij in range(len(boxes)): - ys = slice(*boxes[iij][2:4]) - xs = slice(*boxes[iij][0:2]) + for iij, box in enumerate(boxes): + ys = slice(*box[2:4]) + xs = slice(*box[0:2]) args_contours_box = args_contours[np.array(arg_text_con) == iij] con_inter_box = [] con_inter_box_h = [] for i in range(len(args_contours_box)): con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, 
xs], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, - matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] + indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 4eee5a9..27a85da 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1325,7 +1325,7 @@ def order_of_regions(textline_mask, contours_main, contours_header, y_ref): final_types.append(1) final_index_type.append(ind_missed) - return final_indexers_sorted, matrix_of_orders, final_types, final_index_type + return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( img_p_in_ver, img_in_hor,num_col_classifier): diff --git a/src/eynollah/utils/xml.py b/src/eynollah/utils/xml.py index 13420df..a61dadb 100644 --- a/src/eynollah/utils/xml.py +++ b/src/eynollah/utils/xml.py @@ -65,11 +65,7 @@ def xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_margina og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) region_counter.inc('region') -def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, 
indexes_sorted, index_of_types, kind_of_texts, ref_point): - indexes_sorted = np.array(indexes_sorted) - index_of_types = np.array(index_of_types) - kind_of_texts = np.array(kind_of_texts) - +def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, indexes_sorted, index_of_types, kind_of_texts, ref_point): id_of_texts = [] order_of_texts = [] From 4950e6bd784e2078ca7b65b1fcbf20de29d0f613 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 22:28:52 +0200 Subject: [PATCH 07/44] order_of_regions: simplify - use new `find_center_of_contours` - avoid unused calculations - avoid loops in favour of array processing --- src/eynollah/utils/__init__.py | 131 +++++++++------------------------ 1 file changed, 34 insertions(+), 97 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 27a85da..92da14a 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -15,10 +15,21 @@ from scipy.ndimage import gaussian_filter1d from .is_nan import isNaN from .contour import (contours_in_same_horizon, + find_center_of_contours, find_new_features_of_contours, return_contours_of_image, return_parent_contours) +def pairwise(iterable): + # pairwise('ABCDEFG') → AB BC CD DE EF FG + + iterator = iter(iterable) + a = next(iterator, None) + + for b in iterator: + yield a, b + a = b + def return_x_start_end_mothers_childs_and_type_of_reading_order( x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, cy_hor_diff): @@ -1183,106 +1194,45 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_of_regions(textline_mask, contours_main, contours_header, y_ref): +def order_of_regions(textline_mask, contours_main, contours_head, y_ref): ##plt.imshow(textline_mask) ##plt.show() - """ - print(len(contours_main),'contours_main') - mada_n=textline_mask.sum(axis=1) - 
y=mada_n[:] - - y_help=np.zeros(len(y)+40) - y_help[20:len(y)+20]=y - x=np.arange(len(y)) - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) - ##plt.imshow(textline_mask[:,:]) - ##plt.show() - - sigma_gaus=8 - z= gaussian_filter1d(y_help, sigma_gaus) - zneg_rev=-y_help+np.max(y_help) - zneg=np.zeros(len(zneg_rev)+40) - zneg[20:len(zneg_rev)+20]=zneg_rev - zneg= gaussian_filter1d(zneg, sigma_gaus) - - peaks, _ = find_peaks(z, height=0) - peaks_neg, _ = find_peaks(zneg, height=0) - peaks_neg=peaks_neg-20-20 - peaks=peaks-20 - """ - textline_sum_along_width = textline_mask.sum(axis=1) - - y = textline_sum_along_width[:] + y = textline_mask.sum(axis=1) # horizontal projection profile y_padded = np.zeros(len(y) + 40) y_padded[20 : len(y) + 20] = y - x = np.arange(len(y)) - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) sigma_gaus = 8 - z = gaussian_filter1d(y_padded, sigma_gaus) - zneg_rev = -y_padded + np.max(y_padded) + #z = gaussian_filter1d(y_padded, sigma_gaus) + #peaks, _ = find_peaks(z, height=0) + #peaks = peaks - 20 + zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev zneg = gaussian_filter1d(zneg, sigma_gaus) - peaks, _ = find_peaks(z, height=0) peaks_neg, _ = find_peaks(zneg, height=0) peaks_neg = peaks_neg - 20 - 20 - peaks = peaks - 20 ##plt.plot(z) ##plt.show() - if contours_main != None: - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + cx_main, cy_main = 
find_center_of_contours(contours_main) + cx_head, cy_head = find_center_of_contours(contours_head) - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + peaks_neg_new = np.append(np.insert(peaks_neg, 0, 0), textline_mask.shape[0]) + # offset from bbox of mask + peaks_neg_new += y_ref - if len(contours_header) != None: - areas_header = np.array([cv2.contourArea(contours_header[j]) for j in range(len(contours_header))]) - M_header = [cv2.moments(contours_header[j]) for j in range(len(contours_header))] - cx_header = [(M_header[j]["m10"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - cy_header = [(M_header[j]["m01"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - x_min_header = np.array([np.min(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - x_max_header = np.array([np.max(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - - y_min_header = np.array([np.min(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - y_max_header = np.array([np.max(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - # print(cy_main,'mainy') - - peaks_neg_new = [] - peaks_neg_new.append(0 + y_ref) - for iii in range(len(peaks_neg)): - peaks_neg_new.append(peaks_neg[iii] + y_ref) - peaks_neg_new.append(textline_mask.shape[0] + y_ref) - - if len(cy_main) > 0 and np.max(cy_main) > np.max(peaks_neg_new): - cy_main = np.array(cy_main) * (np.max(peaks_neg_new) / np.max(cy_main)) - 10 - if contours_main != None: - indexer_main = np.arange(len(contours_main)) - if contours_main != None: - len_main = len(contours_main) - else: - len_main = 0 - - matrix_of_orders = np.zeros((len(contours_main) + len(contours_header), 5)) - matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_header)) + matrix_of_orders = np.zeros((len(contours_main) + 
len(contours_head), 5), dtype=int) + matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head)) matrix_of_orders[: len(contours_main), 1] = 1 matrix_of_orders[len(contours_main) :, 1] = 2 matrix_of_orders[: len(contours_main), 2] = cx_main - matrix_of_orders[len(contours_main) :, 2] = cx_header + matrix_of_orders[len(contours_main) :, 2] = cx_head matrix_of_orders[: len(contours_main), 3] = cy_main - matrix_of_orders[len(contours_main) :, 3] = cy_header + matrix_of_orders[len(contours_main) :, 3] = cy_head matrix_of_orders[: len(contours_main), 4] = np.arange(len(contours_main)) - matrix_of_orders[len(contours_main) :, 4] = np.arange(len(contours_header)) + matrix_of_orders[len(contours_main) :, 4] = np.arange(len(contours_head)) # print(peaks_neg_new,'peaks_neg_new') # print(matrix_of_orders,'matrix_of_orders') @@ -1290,27 +1240,14 @@ def order_of_regions(textline_mask, contours_main, contours_header, y_ref): final_indexers_sorted = [] final_types = [] final_index_type = [] - for i in range(len(peaks_neg_new) - 1): - top = peaks_neg_new[i] - down = peaks_neg_new[i + 1] - indexes_in = matrix_of_orders[:, 0][(matrix_of_orders[:, 3] >= top) & - ((matrix_of_orders[:, 3] < down))] - cxs_in = matrix_of_orders[:, 2][(matrix_of_orders[:, 3] >= top) & - ((matrix_of_orders[:, 3] < down))] - cys_in = matrix_of_orders[:, 3][(matrix_of_orders[:, 3] >= top) & - ((matrix_of_orders[:, 3] < down))] - types_of_text = matrix_of_orders[:, 1][(matrix_of_orders[:, 3] >= top) & - (matrix_of_orders[:, 3] < down)] - index_types_of_text = matrix_of_orders[:, 4][(matrix_of_orders[:, 3] >= top) & - (matrix_of_orders[:, 3] < down)] + for top, bot in pairwise(peaks_neg_new): + indexes_in, types_in, cxs_in, cys_in, typed_indexes_in = \ + matrix_of_orders[(matrix_of_orders[:, 3] >= top) & + (matrix_of_orders[:, 3] < bot)].T sorted_inside = np.argsort(cxs_in) - ind_in_int = indexes_in[sorted_inside] - ind_in_type = types_of_text[sorted_inside] - ind_ind_type = 
index_types_of_text[sorted_inside] - for j in range(len(ind_in_int)): - final_indexers_sorted.append(int(ind_in_int[j])) - final_types.append(int(ind_in_type[j])) - final_index_type.append(int(ind_ind_type[j])) + final_indexers_sorted.extend(indexes_in[sorted_inside]) + final_types.extend(types_in[sorted_inside]) + final_index_type.extend(typed_indexes_in[sorted_inside]) ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] From 7387f5a92994bc5c2678be643816e5883f32cfa1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 22:35:40 +0200 Subject: [PATCH 08/44] do_order_of_regions: improve box matching, simplify - when searching for boxes matching contour, be more precise: - avoid heuristic rules ("xmin + 80 within xrange") in favour of exact criteria (contour properly contained in box) - for fallback criterion (nearest centers), also require proper containment of center in box - `order_of_regions`: remove (now) unnecessary (and insufficient) workaround for missing indexes (if boxes are not covering contours exactly) --- src/eynollah/eynollah.py | 185 ++++++++++++++++++--------------- src/eynollah/utils/__init__.py | 14 +-- 2 files changed, 106 insertions(+), 93 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 8351ab6..3194b66 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2518,51 +2518,59 @@ class Eynollah: contours_only_text_parent = np.array(contours_only_text_parent) contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side - cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( + c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), + 0.5 * boxes[:, 0:2].sum(axis=1))) + cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours( contours_only_text_parent) - cx_text_only_h, cy_text_only_h, x_min_text_only_h, _, _, _, 
y_cor_x_min_main_h = find_new_features_of_contours( + cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours( contours_only_text_parent_h) try: arg_text_con = [] - for ii in range(len(cx_text_only)): + for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (x_min_text_only[ii] + 80 >= boxes[jj][0] and - x_min_text_only[ii] + 80 < boxes[jj][1] and - y_cor_x_min_main[ii] >= boxes[jj][2] and - y_cor_x_min_main[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (mx_main[ii] >= box[0] and + Mx_main[ii] < box[1] and + my_main[ii] >= box[2] and + My_main[ii] < box[3]): arg_text_con.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + - (cy_text_only[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con.append(ind_min) - args_contours = np.array(range(len(arg_text_con))) + args_contours = np.arange(len(arg_text_con)) + order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_h = [] - for ii in range(len(cx_text_only_h)): + for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (x_min_text_only_h[ii] + 80 >= boxes[jj][0] and - x_min_text_only_h[ii] + 80 < boxes[jj][1] and - y_cor_x_min_main_h[ii] >= boxes[jj][2] 
and - y_cor_x_min_main_h[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (mx_head[ii] >= box[0] and + Mx_head[ii] < box[1] and + my_head[ii] >= box[2] and + My_head[ii] < box[3]): arg_text_con_h.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only_h[ii] - boxes[jj][1]) ** 2 + - (cy_text_only_h[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_h.append(ind_min) - args_contours_h = np.array(range(len(arg_text_con_h))) - + args_contours_h = np.arange(len(arg_text_con_h)) order_by_con_head = np.zeros(len(arg_text_con_h)) - order_by_con_main = np.zeros(len(arg_text_con)) ref_point = 0 order_of_texts_tot = [] @@ -2590,12 +2598,12 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -2611,53 +2619,59 @@ class 
Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) except Exception as why: self.logger.error(why) arg_text_con = [] - for ii in range(len(cx_text_only)): + for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (cx_text_only[ii] >= boxes[jj][0] and - cx_text_only[ii] < boxes[jj][1] and - cy_text_only[ii] >= boxes[jj][2] and - cy_text_only[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (cx_main[ii] >= box[0] and + cx_main[ii] < box[1] and + cy_main[ii] >= box[2] and + cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located arg_text_con.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + - (cy_text_only[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con.append(ind_min) - args_contours = np.array(range(len(arg_text_con))) + args_contours = np.arange(len(arg_text_con)) order_by_con_main = np.zeros(len(arg_text_con)) ############################# head arg_text_con_h = [] - for ii in range(len(cx_text_only_h)): + for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False 
- for jj in range(len(boxes)): - if (cx_text_only_h[ii] >= boxes[jj][0] and - cx_text_only_h[ii] < boxes[jj][1] and - cy_text_only_h[ii] >= boxes[jj][2] and - cy_text_only_h[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (cx_head[ii] >= box[0] and + cx_head[ii] < box[1] and + cy_head[ii] >= box[2] and + cy_head[ii] < box[3]): # this is valid if the center of region identify in which box it is located arg_text_con_h.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only_h[ii] - boxes[jj][1]) ** 2 + - (cy_text_only_h[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_h.append(ind_min) - args_contours_h = np.array(range(len(arg_text_con_h))) + args_contours_h = np.arange(len(arg_text_con_h)) order_by_con_head = np.zeros(len(arg_text_con_h)) ref_point = 0 @@ -2686,14 +2700,14 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == 
arg_order_v) + ref_point - for jji, _ in enumerate(id_of_texts): + for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) @@ -2707,7 +2721,7 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) self.logger.debug("exit do_order_of_regions_full_layout") return order_text_new, id_of_texts_tot @@ -2719,28 +2733,33 @@ class Eynollah: contours_only_text_parent = np.array(contours_only_text_parent) contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side - cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( + c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), + 0.5 * boxes[:, 0:2].sum(axis=1))) + cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours( contours_only_text_parent) try: arg_text_con = [] - for ii in range(len(cx_text_only)): + for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (x_min_text_only[ii] + 80 >= boxes[jj][0] and - x_min_text_only[ii] + 80 < boxes[jj][1] and - y_cor_x_min_main[ii] >= boxes[jj][2] and - y_cor_x_min_main[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (mx_main[ii] >= box[0] and + Mx_main[ii] < box[1] and + my_main[ii] >= box[2] and + My_main[ii] < box[3]): arg_text_con.append(jj) check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + - (cy_text_only[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) + # dists_tr_from_box = 
[math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) + # for box in boxes] + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con.append(ind_min) - args_contours = np.array(range(len(arg_text_con))) + args_contours = np.arange(len(arg_text_con)) order_by_con_main = np.zeros(len(arg_text_con)) ref_point = 0 @@ -2766,7 +2785,7 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -2779,29 +2798,29 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) except Exception as why: self.logger.error(why) arg_text_con = [] - for ii in range(len(cx_text_only)): + for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False - for jj in range(len(boxes)): - if (cx_text_only[ii] >= boxes[jj][0] and - cx_text_only[ii] < boxes[jj][1] and - cy_text_only[ii] >= boxes[jj][2] and - cy_text_only[ii] < boxes[jj][3]): + for jj, box in enumerate(boxes): + if (cx_main[ii] >= box[0] and + cx_main[ii] < box[1] and + cy_main[ii] >= box[2] and + cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located arg_text_con.append(jj) check_if_textregion_located_in_a_box = 
True break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + - (cy_text_only[ii] - boxes[jj][2]) ** 2) - for jj in range(len(boxes))] - ind_min = np.argmin(dists_tr_from_box) - arg_text_con.append(ind_min) - args_contours = np.array(range(len(arg_text_con))) + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) + pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & + (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) + arg_text_con[ii] = ind_min + args_contours = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros(len(arg_text_con)) ref_point = 0 @@ -2829,7 +2848,7 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ - np.where(indexes_sorted == arg_order_v)[0][0] + ref_point + np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -2843,7 +2862,7 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) self.logger.debug("exit do_order_of_regions_no_full_layout") return order_text_new, id_of_texts_tot diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 92da14a..6e5afd4 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1222,6 +1222,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): # offset from bbox of mask peaks_neg_new += y_ref + # assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= 
np.max(peaks_neg_new) + # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) matrix_of_orders = np.zeros((len(contours_main) + len(contours_head), 5), dtype=int) matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head)) @@ -1251,16 +1253,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] - # This fix is applied if the sum of the lengths of contours and contours_h - # does not match final_indexers_sorted. However, this is not the optimal solution.. - if len(cy_main) + len(cy_header) == len(final_index_type): - pass - else: - indexes_missed = set(np.arange(len(cy_main) + len(cy_header))) - set(final_indexers_sorted) - for ind_missed in indexes_missed: - final_indexers_sorted.append(ind_missed) - final_types.append(1) - final_index_type.append(ind_missed) + # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head) + # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main) return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) From e9bb62bd86747dabd5cd6fb5f67a36547c5c626d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 23:44:00 +0200 Subject: [PATCH 09/44] do_order_of_regions: simplify - avoid loops in favour of array processing --- src/eynollah/eynollah.py | 158 ++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 94 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 3194b66..6a3fd1e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2526,7 +2526,7 @@ class Eynollah: contours_only_text_parent_h) try: - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ 
-2534,7 +2534,7 @@ class Eynollah: Mx_main[ii] < box[1] and my_main[ii] >= box[2] and My_main[ii] < box[3]): - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2545,11 +2545,11 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) - arg_text_con_h = [] + arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2557,7 +2557,7 @@ class Eynollah: Mx_head[ii] < box[1] and my_head[ii] >= box[2] and My_head[ii] < box[3]): - arg_text_con_h.append(jj) + arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2568,9 +2568,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_h.append(ind_min) - args_contours_h = np.arange(len(arg_text_con_h)) - order_by_con_head = np.zeros(len(arg_text_con_h)) + arg_text_con_head[ii] = ind_min + args_contours_head = np.arange(len(contours_only_text_parent_h)) + order_by_con_head = np.zeros_like(arg_text_con_head) ref_point = 0 order_of_texts_tot = [] @@ -2578,10 +2578,10 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) 
- args_contours_box = args_contours[np.array(arg_text_con) == iij] - args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + args_contours_box_head = args_contours_head[arg_text_con_head == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2595,14 +2595,14 @@ class Eynollah: indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - for zahler, _ in enumerate(args_contours_box_h): + for zahler, _ in enumerate(args_contours_box_head): arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ + order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji in range(len(id_of_texts)): @@ -2610,20 +2610,13 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - for tj1 in range(len(contours_only_text_parent_h)): - order_of_texts_tot.append(int(order_by_con_head[tj1])) - - 
order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = np.concatenate((order_by_con_main, + order_by_con_head)) + order_text_new = np.argsort(order_of_texts_tot) except Exception as why: self.logger.error(why) - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2632,10 +2625,9 @@ class Eynollah: cy_main[ii] >= box[2] and cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break - if not check_if_textregion_located_in_a_box: # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) @@ -2644,13 +2636,11 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) - ############################# head - - arg_text_con_h = [] + arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2659,7 +2649,7 @@ class Eynollah: cy_head[ii] >= box[2] and cy_head[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con_h.append(jj) + 
arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2670,9 +2660,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_h.append(ind_min) - args_contours_h = np.arange(len(arg_text_con_h)) - order_by_con_head = np.zeros(len(arg_text_con_h)) + arg_text_con_head[ii] = ind_min + args_contours_head = np.arange(len(contours_only_text_parent_h)) + order_by_con_head = np.zeros_like(arg_text_con_head) ref_point = 0 order_of_texts_tot = [] @@ -2680,10 +2670,10 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + args_contours_box_head = args_contours_head[arg_text_con_head == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2697,14 +2687,14 @@ class Eynollah: indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = 
\ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - for zahler, _ in enumerate(args_contours_box_h): + for zahler, _ in enumerate(args_contours_box_head): arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ + order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji in range(len(id_of_texts)): @@ -2712,16 +2702,9 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - for tj1 in range(len(contours_only_text_parent_h)): - order_of_texts_tot.append(int(order_by_con_head[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = np.concatenate((order_by_con_main, + order_by_con_head)) + order_text_new = np.argsort(order_of_texts_tot) self.logger.debug("exit do_order_of_regions_full_layout") return order_text_new, id_of_texts_tot @@ -2739,7 +2722,7 @@ class Eynollah: contours_only_text_parent) try: - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2747,7 +2730,7 @@ class Eynollah: Mx_main[ii] < box[1] and my_main[ii] >= box[2] and My_main[ii] < box[3]): - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2758,9 +2741,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - 
arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) ref_point = 0 order_of_texts_tot = [] @@ -2768,8 +2751,8 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = [] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( @@ -2782,9 +2765,9 @@ class Eynollah: indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): @@ -2792,17 +2775,12 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = order_by_con_main + order_text_new = np.argsort(order_of_texts_tot) except Exception as why: self.logger.error(why) - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for 
ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2811,7 +2789,7 @@ class Eynollah: cy_main[ii] >= box[2] and cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2819,9 +2797,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con[ii] = ind_min - args_contours = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) ref_point = 0 order_of_texts_tot = [] @@ -2829,11 +2807,9 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = [] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = [] - for i in range(len(args_contours_box)): - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2845,9 +2821,9 @@ class Eynollah: indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - 
order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): @@ -2855,14 +2831,8 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = order_by_con_main + order_text_new = np.argsort(order_of_texts_tot) self.logger.debug("exit do_order_of_regions_no_full_layout") return order_text_new, id_of_texts_tot From e674ea08f383de0c87f950be153fc954c3b4308e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Oct 2025 00:59:25 +0200 Subject: [PATCH 10/44] do_order_of_regions: drop redundant no/full_layout (`_no_full_layout` is the same copied code as `_full_layout`; the latter runs just the same if passed an empty list for headings) --- src/eynollah/eynollah.py | 141 ++------------------------------------ src/eynollah/utils/xml.py | 4 +- 2 files changed, 6 insertions(+), 139 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6a3fd1e..629b001 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2511,10 +2511,10 @@ class Eynollah: self.logger.debug("exit get_regions_from_xy_2models") return text_regions_p_true, erosion_hurts, polygons_seplines - def do_order_of_regions_full_layout( + def do_order_of_regions( self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): - self.logger.debug("enter do_order_of_regions_full_layout") + self.logger.debug("enter do_order_of_regions") contours_only_text_parent = np.array(contours_only_text_parent) contours_only_text_parent_h = 
np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side @@ -2706,135 +2706,7 @@ class Eynollah: order_by_con_head)) order_text_new = np.argsort(order_of_texts_tot) - self.logger.debug("exit do_order_of_regions_full_layout") - return order_text_new, id_of_texts_tot - - def do_order_of_regions_no_full_layout( - self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): - - self.logger.debug("enter do_order_of_regions_no_full_layout") - contours_only_text_parent = np.array(contours_only_text_parent) - contours_only_text_parent_h = np.array(contours_only_text_parent_h) - boxes = np.array(boxes, dtype=int) # to be on the safe side - c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), - 0.5 * boxes[:, 0:2].sum(axis=1))) - cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours( - contours_only_text_parent) - - try: - arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) - for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if (mx_main[ii] >= box[0] and - Mx_main[ii] < box[1] and - my_main[ii] >= box[2] and - My_main[ii] < box[3]): - arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - break - if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_main[ii] = ind_min - args_contours_main = np.arange(len(contours_only_text_parent)) - order_by_con_main = 
np.zeros_like(arg_text_con_main) - - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij, box in enumerate(boxes): - ys = slice(*box[2:4]) - xs = slice(*box[0:2]) - args_contours_box_main = args_contours_main[arg_text_con_main == iij] - con_inter_box = contours_only_text_parent[args_contours_box_main] - con_inter_box_h = [] - - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) - - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji, _ in enumerate(id_of_texts): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = order_by_con_main - order_text_new = np.argsort(order_of_texts_tot) - - except Exception as why: - self.logger.error(why) - arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) - for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if (cx_main[ii] >= box[0] and - cx_main[ii] < box[1] and - cy_main[ii] >= box[2] and - cy_main[ii] < box[3]): - # this is valid if the center of region identify in which box it is located - arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - break - if not check_if_textregion_located_in_a_box: - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) - 
pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_main[ii] = ind_min - args_contours_main = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros_like(arg_text_con_main) - - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij, box in enumerate(boxes): - ys = slice(*box[2:4]) - xs = slice(*box[0:2]) - args_contours_box_main = args_contours_main[arg_text_con_main == iij] - con_inter_box = contours_only_text_parent[args_contours_box_main] - con_inter_box_h = [] - - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) - - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji, _ in enumerate(id_of_texts): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = order_by_con_main - order_text_new = np.argsort(order_of_texts_tot) - - self.logger.debug("exit do_order_of_regions_no_full_layout") + self.logger.debug("exit do_order_of_regions") return order_text_new, id_of_texts_tot def check_iou_of_bounding_box_and_contour_for_tables( @@ -3081,11 +2953,6 @@ class Eynollah: image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv]),:,:]=pixel_table return 
image_revised_last - def do_order_of_regions(self, *args, **kwargs): - if self.full_layout: - return self.do_order_of_regions_full_layout(*args, **kwargs) - return self.do_order_of_regions_no_full_layout(*args, **kwargs) - def get_tables_from_model(self, img, num_col_classifier): img_org = np.copy(img) img_height_h = img_org.shape[0] @@ -5170,7 +5037,7 @@ class Eynollah: return pcgts - contours_only_text_parent_h = None + contours_only_text_parent_h = [] self.logger.info("Step 4/5: Reading Order Detection") if self.reading_order_machine_based: diff --git a/src/eynollah/utils/xml.py b/src/eynollah/utils/xml.py index a61dadb..88d1df8 100644 --- a/src/eynollah/utils/xml.py +++ b/src/eynollah/utils/xml.py @@ -57,8 +57,8 @@ def xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_margina og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) region_counter.inc('region') - for idx_textregion, _ in enumerate(order_of_texts): - og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=region_counter.region_id(order_of_texts[idx_textregion] + 1))) + for idx_textregion in order_of_texts: + og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=region_counter.region_id(idx_textregion + 1))) region_counter.inc('region') for id_marginal in id_of_marginalia_right: From 29b4527bdebf6583f32b8801aed26f6ae70d25c7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 3 Oct 2025 02:06:08 +0200 Subject: [PATCH 11/44] do_order_of_regions: simplify - remove duplicate code via inline def for the try-catch --- src/eynollah/eynollah.py | 127 +++++++-------------------------------- 1 file changed, 22 insertions(+), 105 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 629b001..bb3d1bf 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2525,22 +2525,23 @@ class Eynollah: cx_head, 
cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours( contours_only_text_parent_h) - try: + def match_boxes(only_centers: bool): arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): - if (mx_main[ii] >= box[0] and - Mx_main[ii] < box[1] and - my_main[ii] >= box[2] and - My_main[ii] < box[3]): + if ((cx_main[ii] >= box[0] and + cx_main[ii] < box[1] and + cy_main[ii] >= box[2] and + cy_main[ii] < box[3]) if only_centers else + (mx_main[ii] >= box[0] and + Mx_main[ii] < box[1] and + my_main[ii] >= box[2] and + My_main[ii] < box[3])): arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) @@ -2553,17 +2554,18 @@ class Eynollah: for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): - if (mx_head[ii] >= box[0] and - Mx_head[ii] < box[1] and - my_head[ii] >= box[2] and - My_head[ii] < box[3]): + if ((cx_head[ii] >= box[0] and + cx_head[ii] < box[1] and + cy_head[ii] >= box[2] and + cy_head[ii] < box[3]) if only_centers else + (mx_head[ii] >= box[0] and + Mx_head[ii] < box[1] and + my_head[ii] >= box[2] and + My_head[ii] < box[3])): arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_head[ii] - 0.5 * 
box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) @@ -2613,101 +2615,16 @@ class Eynollah: order_of_texts_tot = np.concatenate((order_by_con_main, order_by_con_head)) order_text_new = np.argsort(order_of_texts_tot) + return order_text_new, id_of_texts_tot + try: + results = match_boxes(False) except Exception as why: self.logger.error(why) - arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) - for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if (cx_main[ii] >= box[0] and - cx_main[ii] < box[1] and - cy_main[ii] >= box[2] and - cy_main[ii] < box[3]): - # this is valid if the center of region identify in which box it is located - arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - break - if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_main[ii] = ind_min - args_contours_main = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros_like(arg_text_con_main) - - arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) - for ii in range(len(contours_only_text_parent_h)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if (cx_head[ii] >= box[0] and - 
cx_head[ii] < box[1] and - cy_head[ii] >= box[2] and - cy_head[ii] < box[3]): - # this is valid if the center of region identify in which box it is located - arg_text_con_head[ii] = jj - check_if_textregion_located_in_a_box = True - break - if not check_if_textregion_located_in_a_box: - # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + - # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) - # for box in boxes] - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_head[ii] = ind_min - args_contours_head = np.arange(len(contours_only_text_parent_h)) - order_by_con_head = np.zeros_like(arg_text_con_head) - - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij, box in enumerate(boxes): - ys = slice(*box[2:4]) - xs = slice(*box[0:2]) - args_contours_box_main = args_contours_main[arg_text_con_main == iij] - args_contours_box_head = args_contours_head[arg_text_con_head == iij] - con_inter_box = contours_only_text_parent[args_contours_box_main] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] - - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) - - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] - indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - - for zahler, _ in 
enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for zahler, _ in enumerate(args_contours_box_head): - arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = np.concatenate((order_by_con_main, - order_by_con_head)) - order_text_new = np.argsort(order_of_texts_tot) + results = match_boxes(True) self.logger.debug("exit do_order_of_regions") - return order_text_new, id_of_texts_tot + return results def check_iou_of_bounding_box_and_contour_for_tables( self, layout, table_prediction_early, pixel_table, num_col_classifier): From d774a23daa80cad0baa16dc4b41e93b93bca39bf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Oct 2025 02:18:17 +0200 Subject: [PATCH 12/44] matching deskewed text region contours with predicted: simplify - avoid loops in favour of array processing - improve readability and identifiers --- src/eynollah/eynollah.py | 108 +++++++++++++++------------------------ 1 file changed, 40 insertions(+), 68 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index bb3d1bf..dd6172a 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4559,27 +4559,16 @@ class Eynollah: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [c for jz, c in 
enumerate(contours_only_text_parent) - if areas_cnt_text[jz] > MIN_AREA_REGION] - areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION] + contour0 = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] + areas_cnt_text_parent = areas_cnt_text[areas_cnt_text > MIN_AREA_REGION] + index_con_parents = np.argsort(areas_cnt_text_parent) + contours_only_text_parent = contours_only_text_parent[index_con_parents] + areas_cnt_text_parent = areas_cnt_text_parent[index_con_parents] - contours_only_text_parent = self.return_list_of_contours_with_desired_order( - contours_only_text_parent, index_con_parents) - - ##try: - ##contours_only_text_parent = \ - ##list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) - ##except: - ##contours_only_text_parent = \ - ##list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents]) - ##areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - areas_cnt_text_parent = self.return_list_of_contours_with_desired_order( - areas_cnt_text_parent, index_con_parents) - - cx_bigest_big, cy_biggest_big = find_center_of_contours([contours_biggest]) - cx_bigest, cy_biggest = find_center_of_contours(contours_only_text_parent) + center0 = np.stack(find_center_of_contours([contour0])) # [2, 1] + centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) @@ -4588,65 +4577,48 @@ class Eynollah: areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - if len(areas_cnt_text_d)>0: - contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] + if len(contours_only_text_parent_d): + contour0_d = 
contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] index_con_parents_d = np.argsort(areas_cnt_text_d) - contours_only_text_parent_d = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d, index_con_parents_d) - #try: - #contours_only_text_parent_d = \ - #list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d]) - #except: - #contours_only_text_parent_d = \ - #list(np.array(contours_only_text_parent_d,dtype=np.int32)[index_con_parents_d]) - #areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d]) - areas_cnt_text_d = self.return_list_of_contours_with_desired_order( - areas_cnt_text_d, index_con_parents_d) + contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] + # rs: should be the same, no? + assert np.all(contour0_d == contours_only_text_parent_d[-1]), (np.argmax(areas_cnt_text_d), index_con_parents_d[-1]) + areas_cnt_text_d = areas_cnt_text_d[index_con_parents_d] - cx_bigest_d_big, cy_biggest_d_big = find_center_of_contours([contours_biggest_d]) - cx_bigest_d, cy_biggest_d = find_center_of_contours(contours_only_text_parent_d) - try: - if len(cx_bigest_d) >= 5: - cx_bigest_d_last5 = cx_bigest_d[-5:] - cy_biggest_d_last5 = cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + - (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) - for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) - else: - cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] - cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] - dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + - (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) - for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) - - cx_bigest_d_big[0] = cx_bigest_d[ind_largest] - cy_biggest_d_big[0] = cy_biggest_d[ind_largest] - except Exception as why: - self.logger.error(str(why)) + 
center0_d = np.stack(find_center_of_contours([contour0_d])) # [2, 1] + centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] + # rs: should be the same, no? + assert center0_d[0,0] == centers_d[0,-1] and center0_d[1,0] == centers_d[1,-1] + last5_centers_d = centers_d[:, -5:] + dists_d = np.linalg.norm(center0 - last5_centers_d, axis=0) + ind_largest = len(contours_only_text_parent_d) - last5_centers_d.shape[1] + np.argmin(dists_d) + center0_d[:, 0] = centers_d[:, ind_largest] + # order new contours the same way as the undeskewed contours + # (by calculating the offset of the largest contours, respectively, + # of the new and undeskewed image; then for each contour, + # finding the closest new contour, with proximity calculated + # as distance of their centers modulo offset vector) (h, w) = text_only.shape[:2] center = (w // 2.0, h // 2.0) M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) M_22 = np.array(M)[:2, :2] - p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - x_diff = p_big[0] - cx_bigest_d_big - y_diff = p_big[1] - cy_biggest_d_big + p0 = np.dot(M_22, center0) # [2, 1] + offset = p0 - center0_d # [2, 1] + # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) contours_only_text_parent_d_ordered = [] for i in range(len(contours_only_text_parent)): - p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) - p[0] = p[0] - x_diff[0] - p[1] = p[1] - y_diff[0] - dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + - (p[1] - cy_biggest_d[j]) ** 2) - for j in range(len(cx_bigest_d))] - contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) - # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) - # plt.imshow(img2[:,:,0]) - # plt.show() + p = np.dot(M_22, centers[:, i:i+1]) # [2, 1] + p -= offset + dists = np.linalg.norm(p - centers_d, axis=0) + contours_only_text_parent_d_ordered.append( + 
contours_only_text_parent_d[np.argmin(dists)]) + # cv2.fillPoly(img2, pts=[contours_only_text_parent_d[np.argmin(dists)]], color=i + 1) + # plt.imshow(img2) + # plt.show() + # rs: what about the remaining contours_only_text_parent_d? + # rs: what about duplicates? else: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] From 73e5a1def8489f6bf022e696f010d4c852ff685b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Oct 2025 02:33:03 +0200 Subject: [PATCH 13/44] matching deskewed text region contours with predicted: simplify - (no need for argmax if already sorted) --- src/eynollah/eynollah.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index dd6172a..46437f0 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4559,7 +4559,6 @@ class Eynollah: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contour0 = contours_only_text_parent[np.argmax(areas_cnt_text)] contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] areas_cnt_text_parent = areas_cnt_text[areas_cnt_text > MIN_AREA_REGION] @@ -4567,9 +4566,11 @@ class Eynollah: contours_only_text_parent = contours_only_text_parent[index_con_parents] areas_cnt_text_parent = areas_cnt_text_parent[index_con_parents] - center0 = np.stack(find_center_of_contours([contour0])) # [2, 1] centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] + contour0 = contours_only_text_parent[-1] + center0 = centers[:, -1:] # [2, 1] + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) @@ -4578,17 +4579,15 @@ class 
Eynollah: areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) if len(contours_only_text_parent_d): - contour0_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] index_con_parents_d = np.argsort(areas_cnt_text_d) contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] - # rs: should be the same, no? - assert np.all(contour0_d == contours_only_text_parent_d[-1]), (np.argmax(areas_cnt_text_d), index_con_parents_d[-1]) areas_cnt_text_d = areas_cnt_text_d[index_con_parents_d] - center0_d = np.stack(find_center_of_contours([contour0_d])) # [2, 1] centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] - # rs: should be the same, no? - assert center0_d[0,0] == centers_d[0,-1] and center0_d[1,0] == centers_d[1,-1] + + contour0_d = contours_only_text_parent_d[-1] + center0_d = centers_d[:, -1:] # [2, 1] + last5_centers_d = centers_d[:, -5:] dists_d = np.linalg.norm(center0 - last5_centers_d, axis=0) ind_largest = len(contours_only_text_parent_d) - last5_centers_d.shape[1] + np.argmin(dists_d) From 0f33c21eb3a9cbe87f7221dd3481203de415794d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Oct 2025 02:45:01 +0200 Subject: [PATCH 14/44] matching deskewed text region contours with predicted: improve - when matching undeskewed and new contours, do not just pick the closest centers, respectively, but also of similar size (by making the contour area the 3rd dimension of the vector norm in the distance calculation) --- src/eynollah/eynollah.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 46437f0..e474916 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4610,7 +4610,11 @@ class Eynollah: for i in range(len(contours_only_text_parent)): p = np.dot(M_22, centers[:, i:i+1]) # [2, 1] p -= offset - dists = np.linalg.norm(p - centers_d, axis=0) + # add dimension for 
area + #dists = np.linalg.norm(p - centers_d, axis=0) + diffs = (np.append(p, [[areas_cnt_text_parent[i]]], axis=0) - + np.append(centers_d, areas_cnt_text_d[np.newaxis], axis=0)) + dists = np.linalg.norm(diffs, axis=0) contours_only_text_parent_d_ordered.append( contours_only_text_parent_d[np.argmin(dists)]) # cv2.fillPoly(img2, pts=[contours_only_text_parent_d[np.argmin(dists)]], color=i + 1) From 0e00d7868be55d3fb94b52fffc6ed96bf9387067 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 12:55:10 +0200 Subject: [PATCH 15/44] matching deskewed text region contours with predicted: improve - apply same min-area filter to deskewed contours as to original ones --- src/eynollah/eynollah.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index e474916..e5ad5ae 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4568,7 +4568,6 @@ class Eynollah: centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] - contour0 = contours_only_text_parent[-1] center0 = centers[:, -1:] # [2, 1] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: @@ -4578,6 +4577,9 @@ class Eynollah: areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) + contours_only_text_parent_d = np.array(contours_only_text_parent_d)[areas_cnt_text_d > MIN_AREA_REGION] + areas_cnt_text_d = areas_cnt_text_d[areas_cnt_text_d > MIN_AREA_REGION] + if len(contours_only_text_parent_d): index_con_parents_d = np.argsort(areas_cnt_text_d) contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] @@ -4585,9 +4587,10 @@ class Eynollah: centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] - contour0_d = contours_only_text_parent_d[-1] center0_d = centers_d[:, -1:] # [2, 1] + # find the largest among the 
largest 5 deskewed contours + # that is also closest to the largest original contour last5_centers_d = centers_d[:, -5:] dists_d = np.linalg.norm(center0 - last5_centers_d, axis=0) ind_largest = len(contours_only_text_parent_d) - last5_centers_d.shape[1] + np.argmin(dists_d) @@ -4762,14 +4765,7 @@ class Eynollah: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( contours_only_text_parent_d_ordered, index_by_text_par_con) - #try: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) - #except: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) else: - #takes long timee contours_only_text_parent_d_ordered = None if self.light_version: fun = check_any_text_region_in_model_one_is_main_or_header_light @@ -4949,12 +4945,6 @@ class Eynollah: else: contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( contours_only_text_parent_d_ordered, index_by_text_par_con) - #try: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - #except: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) From 155b8f68b8a7754de11e002e0df2bfc7292899d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 12:58:24 +0200 Subject: [PATCH 16/44] matching deskewed text region contours with predicted: improve - avoid duplicate and missing mappings by using a different approach: instead of just minimising the center distance for the N contours that we expect, 1. get all N:M distances 2. 
iterate over them from small to large 3. continue adding correspondences until both every original contour and every deskewed contour have at least one match 4. where one original matches multiple deskewed contours, join the latter polygons to map as single contour 5. where one deskewed contour matches multiple originals, split the former by intersecting with each of the latter (after bringing them into the same coordinate space), so ultimately only the respective match gets assigned --- src/eynollah/eynollah.py | 94 ++++++++++++++++++++++++++++------- src/eynollah/utils/contour.py | 15 ++++++ 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index e5ad5ae..5e32929 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -33,6 +33,7 @@ from concurrent.futures import ProcessPoolExecutor import xml.etree.ElementTree as ET import cv2 import numpy as np +import shapely.affinity from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from numba import cuda @@ -83,6 +84,10 @@ from .utils.contour import ( return_parent_contours, dilate_textregion_contours, dilate_textline_contours, + polygon2contour, + contour2polygon, + join_polygons, + make_intersection, ) from .utils.rotate import ( rotate_image, @@ -4556,8 +4561,9 @@ class Eynollah: contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) if len(contours_only_text_parent) > 0: + areas_tot_text = np.prod(text_only.shape) areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + areas_cnt_text = areas_cnt_text / float(areas_tot_text) #self.logger.info('areas_cnt_text %s', areas_cnt_text) contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] areas_cnt_text_parent = 
areas_cnt_text[areas_cnt_text > MIN_AREA_REGION] @@ -4574,8 +4580,9 @@ class Eynollah: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) + areas_tot_text_d = np.prod(text_only_d.shape) areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) - areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) + areas_cnt_text_d = areas_cnt_text_d / float(areas_tot_text_d) contours_only_text_parent_d = np.array(contours_only_text_parent_d)[areas_cnt_text_d > MIN_AREA_REGION] areas_cnt_text_d = areas_cnt_text_d[areas_cnt_text_d > MIN_AREA_REGION] @@ -4587,7 +4594,7 @@ class Eynollah: centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] - center0_d = centers_d[:, -1:] # [2, 1] + center0_d = centers_d[:, -1:].copy() # [2, 1] # find the largest among the largest 5 deskewed contours # that is also closest to the largest original contour @@ -4605,26 +4612,75 @@ class Eynollah: center = (w // 2.0, h // 2.0) M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) M_22 = np.array(M)[:2, :2] - p0 = np.dot(M_22, center0) # [2, 1] - offset = p0 - center0_d # [2, 1] + center0 = np.dot(M_22, center0) # [2, 1] + offset = center0 - center0_d # [2, 1] - # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) - contours_only_text_parent_d_ordered = [] + centers = np.dot(M_22, centers) - offset # [2,N] + # add dimension for area (so only contours of similar size will be considered close) + centers = np.append(centers, areas_cnt_text_parent[np.newaxis], axis=0) + centers_d = np.append(centers_d, areas_cnt_text_d[np.newaxis], axis=0) + + dists = np.zeros((len(contours_only_text_parent), len(contours_only_text_parent_d))) for i in range(len(contours_only_text_parent)): - p = np.dot(M_22, centers[:, i:i+1]) # [2, 1] - p -= offset - # add dimension for area - #dists = np.linalg.norm(p - 
centers_d, axis=0) - diffs = (np.append(p, [[areas_cnt_text_parent[i]]], axis=0) - - np.append(centers_d, areas_cnt_text_d[np.newaxis], axis=0)) - dists = np.linalg.norm(diffs, axis=0) - contours_only_text_parent_d_ordered.append( - contours_only_text_parent_d[np.argmin(dists)]) - # cv2.fillPoly(img2, pts=[contours_only_text_parent_d[np.argmin(dists)]], color=i + 1) + dists[i] = np.linalg.norm(centers[:, i:i + 1] - centers_d, axis=0) + corresp = np.zeros(dists.shape, dtype=bool) + # keep searching next-closest until at least one correspondence on each side + while not np.all(corresp.sum(axis=1)) and not np.all(corresp.sum(axis=0)): + idx = np.nanargmin(dists) + i, j = np.unravel_index(idx, dists.shape) + dists[i, j] = np.nan + corresp[i, j] = True + #print("original/deskewed adjacency", corresp.nonzero()) + contours_only_text_parent_d_ordered = np.zeros_like(contours_only_text_parent) + contours_only_text_parent_d_ordered = contours_only_text_parent_d[np.argmax(corresp, axis=1)] + # img1 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) + # for i in range(len(contours_only_text_parent)): + # cv2.fillPoly(img1, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) + # plt.subplot(2, 2, 1, title="direct corresp contours") + # plt.imshow(img1) + # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) + # join deskewed regions mapping to single original ones + for i in range(len(contours_only_text_parent)): + if np.count_nonzero(corresp[i]) > 1: + indices = np.flatnonzero(corresp[i]) + #print("joining", indices) + polygons_d = [contour2polygon(contour) + for contour in contours_only_text_parent_d[indices]] + contour_d = polygon2contour(join_polygons(polygons_d)) + contours_only_text_parent_d_ordered[i] = contour_d + # cv2.fillPoly(img2, pts=[contour_d], color=i + 1) + # plt.subplot(2, 2, 3, title="joined contours") # plt.imshow(img2) + # img3 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) + # split deskewed regions mapping to multiple original ones + def 
deskew(polygon): + polygon = shapely.affinity.rotate(polygon, -slope_deskew, origin=center) + polygon = shapely.affinity.translate(polygon, *offset.squeeze()) + return polygon + for j in range(len(contours_only_text_parent_d)): + if np.count_nonzero(corresp[:, j]) > 1: + indices = np.flatnonzero(corresp[:, j]) + #print("splitting along", indices) + polygons = [deskew(contour2polygon(contour)) + for contour in contours_only_text_parent[indices]] + polygon_d = contour2polygon(contours_only_text_parent_d[j]) + polygons_d = [make_intersection(polygon_d, polygon) + for polygon in polygons] + # ignore where there is no actual overlap + indices = indices[np.flatnonzero(polygons_d)] + contours_d = [polygon2contour(polygon_d) + for polygon_d in polygons_d + if polygon_d] + contours_only_text_parent_d_ordered[indices] = contours_d + # cv2.fillPoly(img3, pts=contours_d, color=j + 1) + # plt.subplot(2, 2, 4, title="split contours") + # plt.imshow(img3) + # img4 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) + # for i in range(len(contours_only_text_parent)): + # cv2.fillPoly(img4, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) + # plt.subplot(2, 2, 2, title="result contours") + # plt.imshow(img4) # plt.show() - # rs: what about the remaining contours_only_text_parent_d? - # rs: what about duplicates? 
else: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 041cbf6..8431bbe 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -335,6 +335,21 @@ def polygon2contour(polygon: Polygon) -> np.ndarray: polygon = np.array(polygon.exterior.coords[:-1], dtype=int) return np.maximum(0, polygon).astype(np.uint)[:, np.newaxis] +def make_intersection(poly1, poly2): + interp = poly1.intersection(poly2) + # post-process + if interp.is_empty or interp.area == 0.0: + return None + if interp.geom_type == 'GeometryCollection': + # heterogeneous result: filter zero-area shapes (LineString, Point) + interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) + if interp.geom_type == 'MultiPolygon': + # homogeneous result: construct convex hull to connect + interp = join_polygons(interp.geoms) + assert interp.geom_type == 'Polygon', interp.wkt + interp = make_valid(interp) + return interp + def make_valid(polygon: Polygon) -> Polygon: """Ensures shapely.geometry.Polygon object is valid by repeated rearrangement/simplification/enlargement.""" def isint(x): From fe603188f4f7f9d545b44085cdc45195f98f0546 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 13:11:03 +0200 Subject: [PATCH 17/44] avoid unnecessary 3-channel conversions --- src/eynollah/eynollah.py | 52 ++++----- src/eynollah/utils/__init__.py | 156 +++++++++++---------------- src/eynollah/utils/contour.py | 74 +++++-------- src/eynollah/utils/separate_lines.py | 53 ++++----- 4 files changed, 132 insertions(+), 203 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 5e32929..834ecf3 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -712,7 +712,7 @@ class Eynollah: if self.input_binary: img = self.imread() prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) - prediction_bin = 255 * 
(prediction_bin[:,:,0]==0) + prediction_bin = 255 * (prediction_bin[:,:,0] == 0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) img= np.copy(prediction_bin) img_bin = prediction_bin @@ -2064,9 +2064,7 @@ class Eynollah: boxes_sub_new = [] poly_sub = [] for mv in range(len(boxes_per_process)): - crop_img, _ = crop_image_inside_box(boxes_per_process[mv], - np.repeat(textline_mask_tot[:, :, np.newaxis], 3, axis=2)) - crop_img = crop_img[:, :, 0] + crop_img, _ = crop_image_inside_box(boxes_per_process[mv], textline_mask_tot) crop_img = cv2.erode(crop_img, KERNEL, iterations=2) try: textline_con, hierarchy = return_contours_of_image(crop_img) @@ -2638,10 +2636,8 @@ class Eynollah: layout_org[:,:,0][layout_org[:,:,0]==pixel_table] = 0 layout = (layout[:,:,0]==pixel_table)*1 - layout =np.repeat(layout[:, :, np.newaxis], 3, axis=2) layout = layout.astype(np.uint8) - imgray = cv2.cvtColor(layout, cv2.COLOR_BGR2GRAY ) - _, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(layout, 0, 255, 0) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cnt_size = np.array([cv2.contourArea(contours[j]) @@ -2652,8 +2648,8 @@ class Eynollah: x, y, w, h = cv2.boundingRect(contours[i]) iou = cnt_size[i] /float(w*h) *100 if iou<80: - layout_contour = np.zeros((layout_org.shape[0], layout_org.shape[1])) - layout_contour= cv2.fillPoly(layout_contour,pts=[contours[i]] ,color=(1,1,1)) + layout_contour = np.zeros(layout_org.shape[:2]) + layout_contour = cv2.fillPoly(layout_contour, pts=[contours[i]] ,color=1) layout_contour_sum = layout_contour.sum(axis=0) layout_contour_sum_diff = np.diff(layout_contour_sum) @@ -2669,20 +2665,17 @@ class Eynollah: layout_contour=cv2.erode(layout_contour[:,:], KERNEL, iterations=5) layout_contour=cv2.dilate(layout_contour[:,:], KERNEL, iterations=5) - layout_contour =np.repeat(layout_contour[:, :, np.newaxis], 3, axis=2) layout_contour = layout_contour.astype(np.uint8) - - 
imgray = cv2.cvtColor(layout_contour, cv2.COLOR_BGR2GRAY ) - _, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(layout_contour, 0, 255, 0) contours_sep, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) for ji in range(len(contours_sep) ): contours_new.append(contours_sep[ji]) if num_col_classifier>=2: - only_recent_contour_image = np.zeros((layout.shape[0],layout.shape[1])) - only_recent_contour_image= cv2.fillPoly(only_recent_contour_image, - pts=[contours_sep[ji]], color=(1,1,1)) + only_recent_contour_image = np.zeros(layout.shape[:2]) + only_recent_contour_image = cv2.fillPoly(only_recent_contour_image, + pts=[contours_sep[ji]], color=1) table_pixels_masked_from_early_pre = only_recent_contour_image * table_prediction_early iou_in = 100. * table_pixels_masked_from_early_pre.sum() / only_recent_contour_image.sum() #print(iou_in,'iou_in_in1') @@ -3210,13 +3203,11 @@ class Eynollah: pixel_lines = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: _, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_lines) + text_regions_p, num_col_classifier, self.tables, pixel_lines) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_lines) + text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3392,13 +3383,11 @@ class Eynollah: pixel_lines=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_lines) + text_regions_p, num_col_classifier, 
self.tables, pixel_lines) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_lines) + text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3498,7 +3487,7 @@ class Eynollah: #text_regions_p[:,:][regions_fully[:,:,0]==6]=6 ##regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) - ##regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 + ##regions_fully[:, :, 0][regions_fully_only_drop[:, :] == 4] = 4 drop_capital_label_in_full_layout_model = 3 drops = (regions_fully[:,:,0]==drop_capital_label_in_full_layout_model)*1 @@ -4715,7 +4704,6 @@ class Eynollah: return pcgts - #print("text region early 3 in %.1fs", time.time() - t0) if self.light_version: contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) @@ -4851,21 +4839,17 @@ class Eynollah: if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, label_seps, contours_only_text_parent_h) + text_regions_p, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h) else: _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) + text_regions_p_1_n, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - 
np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, label_seps) + text_regions_p, num_col_classifier, self.tables, label_seps) else: _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, label_seps) + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 6e5afd4..ebf78fe 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -796,7 +796,7 @@ def find_num_col_only_image(regions_without_separators, multiplier=3.8): return len(peaks_fin_true), peaks_fin_true def find_num_col_by_vertical_lines(regions_without_separators, multiplier=3.8): - regions_without_separators_0 = regions_without_separators[:, :, 0].sum(axis=0) + regions_without_separators_0 = regions_without_separators.sum(axis=0) ##plt.plot(regions_without_separators_0) ##plt.show() @@ -823,7 +823,10 @@ def return_regions_without_separators(regions_pre): return regions_without_separators def put_drop_out_from_only_drop_model(layout_no_patch, layout1): - drop_only = (layout_no_patch[:, :, 0] == 4) * 1 + if layout_no_patch.ndim == 3: + layout_no_patch = layout_no_patch[:, :, 0] + + drop_only = (layout_no_patch[:, :] == 4) * 1 contours_drop, hir_on_drop = return_contours_of_image(drop_only) contours_drop_parent = return_parent_contours(contours_drop, hir_on_drop) @@ -849,9 +852,8 @@ def put_drop_out_from_only_drop_model(layout_no_patch, layout1): (map_of_drop_contour_bb == 5).sum()) >= 15: contours_drop_parent_final.append(contours_drop_parent[jj]) - layout_no_patch[:, :, 0][layout_no_patch[:, :, 0] == 4] = 0 - - layout_no_patch = cv2.fillPoly(layout_no_patch, pts=contours_drop_parent_final, color=(4, 4, 4)) + layout_no_patch[:, :][layout_no_patch[:, :] == 4] 
= 0 + layout_no_patch = cv2.fillPoly(layout_no_patch, pts=contours_drop_parent_final, color=4) return layout_no_patch @@ -925,17 +927,16 @@ def check_any_text_region_in_model_one_is_main_or_header( contours_only_text_parent_main_d=[] contours_only_text_parent_head_d=[] - for ii in range(len(contours_only_text_parent)): - con=contours_only_text_parent[ii] - img=np.zeros((regions_model_1.shape[0],regions_model_1.shape[1],3)) - img = cv2.fillPoly(img, pts=[con], color=(255, 255, 255)) + for ii, con in enumerate(contours_only_text_parent): + img = np.zeros(regions_model_1.shape[:2]) + img = cv2.fillPoly(img, pts=[con], color=255) - all_pixels=((img[:,:,0]==255)*1).sum() - pixels_header=( ( (img[:,:,0]==255) & (regions_model_full[:,:,0]==2) )*1 ).sum() + all_pixels=((img == 255)*1).sum() + pixels_header=( ( (img == 255) & (regions_model_full[:,:,0]==2) )*1 ).sum() pixels_main=all_pixels-pixels_header if (pixels_header>=pixels_main) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ]=2 contours_only_text_parent_head.append(con) if contours_only_text_parent_d_ordered is not None: contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) @@ -944,7 +945,7 @@ def check_any_text_region_in_model_one_is_main_or_header( all_found_textline_polygons_head.append(all_found_textline_polygons[ii]) conf_contours_head.append(None) else: - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ]=1 contours_only_text_parent_main.append(con) conf_contours_main.append(conf_contours[ii]) if contours_only_text_parent_d_ordered is not None: @@ -1015,11 +1016,11 @@ def check_any_text_region_in_model_one_is_main_or_header_light( contours_only_text_parent_head_d=[] for ii, con in enumerate(contours_only_text_parent_z): - 
img=np.zeros((regions_model_1.shape[0], regions_model_1.shape[1], 3)) - img = cv2.fillPoly(img, pts=[con], color=(255, 255, 255)) + img = np.zeros(regions_model_1.shape[:2]) + img = cv2.fillPoly(img, pts=[con], color=255) - all_pixels = (img[:,:,0]==255).sum() - pixels_header=((img[:,:,0]==255) & + all_pixels = (img == 255).sum() + pixels_header=((img == 255) & (regions_model_full[:,:,0]==2)).sum() pixels_main = all_pixels - pixels_header @@ -1029,7 +1030,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( ( pixels_header / float(pixels_main) >= 0.3 and length_con[ii] / float(height_con[ii]) >=3 )): - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ] = 2 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ] = 2 contours_only_text_parent_head.append(contours_only_text_parent[ii]) conf_contours_head.append(None) # why not conf_contours[ii], too? if contours_only_text_parent_d_ordered is not None: @@ -1039,7 +1040,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( all_found_textline_polygons_head.append(all_found_textline_polygons[ii]) else: - regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ] = 1 + regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ] = 1 contours_only_text_parent_main.append(contours_only_text_parent[ii]) conf_contours_main.append(conf_contours[ii]) if contours_only_text_parent_d_ordered is not None: @@ -1119,11 +1120,11 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_big.append(textlines_tot[i]) textlines_big_org_form.append(textlines_tot_org_form[i]) - img_textline_s = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_textline_s = cv2.fillPoly(img_textline_s, pts=textlines_small, color=(1, 1, 1)) + img_textline_s = np.zeros(textline_iamge.shape[:2]) + img_textline_s = cv2.fillPoly(img_textline_s, pts=textlines_small, color=1) - img_textline_b = np.zeros((textline_iamge.shape[0], 
textline_iamge.shape[1])) - img_textline_b = cv2.fillPoly(img_textline_b, pts=textlines_big, color=(1, 1, 1)) + img_textline_b = np.zeros(textline_iamge.shape[:2]) + img_textline_b = cv2.fillPoly(img_textline_b, pts=textlines_big, color=1) sum_small_big_all = img_textline_s + img_textline_b sum_small_big_all2 = (sum_small_big_all[:, :] == 2) * 1 @@ -1135,11 +1136,11 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) # print(len(textlines_small),'small') intersections = [] for z2 in range(len(textlines_big)): - img_text = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_text = cv2.fillPoly(img_text, pts=[textlines_small[z1]], color=(1, 1, 1)) + img_text = np.zeros(textline_iamge.shape[:2]) + img_text = cv2.fillPoly(img_text, pts=[textlines_small[z1]], color=1) - img_text2 = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z2]], color=(1, 1, 1)) + img_text2 = np.zeros(textline_iamge.shape[:2]) + img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z2]], color=1) sum_small_big = img_text2 + img_text sum_small_big_2 = (sum_small_big[:, :] == 2) * 1 @@ -1165,19 +1166,17 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) index_small_textlines = list(np.where(np.array(dis_small_from_bigs_tot) == z)[0]) # print(z,index_small_textlines) - img_text2 = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1], 3)) - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z]], color=(255, 255, 255)) + img_text2 = np.zeros(textline_iamge.shape[:2], dtype=np.uint8) + img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z]], color=255) textlines_big_with_change.append(z) for k in index_small_textlines: - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_small[k]], color=(255, 255, 255)) + img_text2 = cv2.fillPoly(img_text2, pts=[textlines_small[k]], color=255) textlines_small_with_change.append(k) - img_text2 = 
img_text2.astype(np.uint8) - imgray = cv2.cvtColor(img_text2, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - cont, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(img_text2, 0, 255, 0) + cont, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) # print(cont[0],type(cont)) textlines_big_with_change_con.append(cont) @@ -1189,9 +1188,8 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) # print(textlines_big_with_change,'textlines_big_with_change') # print(textlines_small_with_change,'textlines_small_with_change') # print(textlines_big) - textlines_con_changed.append(textlines_big_org_form) - else: - textlines_con_changed.append(textlines_big_org_form) + + textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed def order_of_regions(textline_mask, contours_main, contours_head, y_ref): @@ -1262,29 +1260,22 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( img_p_in_ver, img_in_hor,num_col_classifier): #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2) - img_p_in_ver=img_p_in_ver.astype(np.uint8) - img_p_in_ver=np.repeat(img_p_in_ver[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(img_p_in_ver, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_lines_ver,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(img_p_in_ver, 0, 255, 0) + contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) slope_lines_ver, _, x_min_main_ver, _, _, _, y_min_main_ver, y_max_main_ver, cx_main_ver = \ find_features_of_lines(contours_lines_ver) for i in range(len(x_min_main_ver)): img_p_in_ver[int(y_min_main_ver[i]): int(y_min_main_ver[i])+30, int(cx_main_ver[i])-25: - int(cx_main_ver[i])+25, 0] = 0 + int(cx_main_ver[i])+25] = 0 
img_p_in_ver[int(y_max_main_ver[i])-30: int(y_max_main_ver[i]), int(cx_main_ver[i])-25: - int(cx_main_ver[i])+25, 0] = 0 + int(cx_main_ver[i])+25] = 0 - img_in_hor=img_in_hor.astype(np.uint8) - img_in_hor=np.repeat(img_in_hor[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(img_in_hor, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_lines_hor,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(img_in_hor, 0, 255, 0) + contours_lines_hor, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, _, _, _, _ = \ find_features_of_lines(contours_lines_hor) @@ -1340,22 +1331,19 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( img_p_in=img_in_hor special_separators=[] - img_p_in_ver[:,:,0][img_p_in_ver[:,:,0]==255]=1 - sep_ver_hor=img_p_in+img_p_in_ver - sep_ver_hor_cross=(sep_ver_hor[:,:,0]==2)*1 - sep_ver_hor_cross=np.repeat(sep_ver_hor_cross[:, :, np.newaxis], 3, axis=2) - sep_ver_hor_cross=sep_ver_hor_cross.astype(np.uint8) - imgray = cv2.cvtColor(sep_ver_hor_cross, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_cross,_=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - cx_cross, cy_cross = find_center_of_contours(contours_cross) - for ii in range(len(cx_cross)): - img_p_in[int(cy_cross[ii])-30:int(cy_cross[ii])+30,int(cx_cross[ii])+5:int(cx_cross[ii])+40,0]=0 - img_p_in[int(cy_cross[ii])-30:int(cy_cross[ii])+30,int(cx_cross[ii])-40:int(cx_cross[ii])-4,0]=0 + img_p_in_ver[img_p_in_ver == 255] = 1 + sep_ver_hor = img_p_in + img_p_in_ver + sep_ver_hor_cross = (sep_ver_hor == 2) * 1 + _, thresh = cv2.threshold(sep_ver_hor_cross.astype(np.uint8), 0, 255, 0) + contours_cross, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + center_cross = 
np.array(find_center_of_contours(contours_cross), dtype=int) + for cx, cy in center_cross.T: + img_p_in[cy - 30: cy + 30, cx + 5: cx + 40] = 0 + img_p_in[cy - 30: cy + 30, cx - 40: cx - 4] = 0 else: img_p_in=np.copy(img_in_hor) special_separators=[] - return img_p_in[:,:,0], special_separators + return img_p_in, special_separators def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot = [] @@ -1365,11 +1353,11 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, pixel_lines, contours_h=None): +def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_lines, contours_h=None): t_ins_c0 = time.time() - separators_closeup=( (region_pre_p[:,:,:]==pixel_lines))*1 - separators_closeup[0:110,:,:]=0 - separators_closeup[separators_closeup.shape[0]-150:,:,:]=0 + separators_closeup=( (region_pre_p[:,:]==label_lines))*1 + separators_closeup[0:110,:]=0 + separators_closeup[separators_closeup.shape[0]-150:,:]=0 kernel = np.ones((5,5),np.uint8) separators_closeup=separators_closeup.astype(np.uint8) @@ -1381,15 +1369,11 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, separators_closeup_n=separators_closeup_n.astype(np.uint8) separators_closeup_n_binary=np.zeros(( separators_closeup_n.shape[0],separators_closeup_n.shape[1]) ) - separators_closeup_n_binary[:,:]=separators_closeup_n[:,:,0] + separators_closeup_n_binary[:,:]=separators_closeup_n[:,:] separators_closeup_n_binary[:,:][separators_closeup_n_binary[:,:]!=0]=1 - gray_early=np.repeat(separators_closeup_n_binary[:, :, np.newaxis], 3, axis=2) - gray_early=gray_early.astype(np.uint8) - imgray_e = cv2.cvtColor(gray_early, cv2.COLOR_BGR2GRAY) - ret_e, thresh_e = cv2.threshold(imgray_e, 0, 255, 0) - - 
contours_line_e,hierarchy_e=cv2.findContours(thresh_e,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh_e = cv2.threshold(separators_closeup_n_binary, 0, 255, 0) + contours_line_e, _ = cv2.findContours(thresh_e.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) _, dist_xe, _, _, _, _, y_min_main, y_max_main, _ = \ find_features_of_lines(contours_line_e) dist_ye = y_max_main - y_min_main @@ -1399,10 +1383,8 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, cnts_hor_e=[] for ce in args_hor_e: cnts_hor_e.append(contours_line_e[ce]) - figs_e=np.zeros(thresh_e.shape) - figs_e=cv2.fillPoly(figs_e,pts=cnts_hor_e,color=(1,1,1)) - separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=(0,0,0)) + separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) gray = cv2.bitwise_not(separators_closeup_n_binary) gray=gray.astype(np.uint8) @@ -1422,7 +1404,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, kernel = np.ones((5,5),np.uint8) horizontal = cv2.dilate(horizontal,kernel,iterations = 2) horizontal = cv2.erode(horizontal,kernel,iterations = 2) - horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=(255,255,255)) + horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=255) rows = vertical.shape[0] verticalsize = rows // 30 @@ -1440,13 +1422,8 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, separators_closeup_new[:,:][vertical[:,:]!=0]=1 separators_closeup_new[:,:][horizontal[:,:]!=0]=1 - vertical=np.repeat(vertical[:, :, np.newaxis], 3, axis=2) - vertical=vertical.astype(np.uint8) - - imgray = cv2.cvtColor(vertical, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_line_vers,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(vertical, 0, 255, 0) + contours_line_vers, _ = 
cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ find_features_of_lines(contours_line_vers) @@ -1461,11 +1438,8 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, dist_y_ver=y_max_main_ver-y_min_main_ver len_y=separators_closeup.shape[0]/3.0 - horizontal=np.repeat(horizontal[:, :, np.newaxis], 3, axis=2) - horizontal=horizontal.astype(np.uint8) - imgray = cv2.cvtColor(horizontal, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_line_hors,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, thresh = cv2.threshold(horizontal, 0, 255, 0) + contours_line_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ find_features_of_lines(contours_line_hors) @@ -1558,7 +1532,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, peaks_neg_fin_fin=[] for itiles in args_big_parts: regions_without_separators_tile=regions_without_separators[int(splitter_y_new[itiles]): - int(splitter_y_new[itiles+1]),:,0] + int(splitter_y_new[itiles+1]),:] try: num_col, peaks_neg_fin = find_num_col(regions_without_separators_tile, num_col_classifier, tables, multiplier=7.0) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 8431bbe..22a6f50 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -119,14 +119,11 @@ def return_parent_contours(contours, hierarchy): def return_contours_of_interested_region(region_pre_p, label, min_area=0.0002): # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: + if region_pre_p.ndim == 3: cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: cnts_images = (region_pre_p[:, :] == label) * 1 - 
cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) @@ -135,13 +132,11 @@ def return_contours_of_interested_region(region_pre_p, label, min_area=0.0002): return contours_imgs def do_work_of_contours_in_image(contour, index_r_con, img, slope_first): - img_copy = np.zeros(img.shape) - img_copy = cv2.fillPoly(img_copy, pts=[contour], color=(1, 1, 1)) + img_copy = np.zeros(img.shape[:2], dtype=np.uint8) + img_copy = cv2.fillPoly(img_copy, pts=[contour], color=1) img_copy = rotation_image_new(img_copy, -slope_first) - img_copy = img_copy.astype(np.uint8) - imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(img_copy, 0, 255, 0) cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) @@ -164,8 +159,8 @@ def get_textregion_contours_in_org_image(cnts, img, slope_first): cnts_org = [] # print(cnts,'cnts') for i in range(len(cnts)): - img_copy = np.zeros(img.shape) - img_copy = cv2.fillPoly(img_copy, pts=[cnts[i]], color=(1, 1, 1)) + img_copy = np.zeros(img.shape[:2], dtype=np.uint8) + img_copy = cv2.fillPoly(img_copy, pts=[cnts[i]], color=1) # plt.imshow(img_copy) # plt.show() @@ -176,9 +171,7 @@ def get_textregion_contours_in_org_image(cnts, img, slope_first): # plt.imshow(img_copy) # plt.show() - img_copy = img_copy.astype(np.uint8) - imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(img_copy, 0, 255, 0) cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cont_int[0][:, 0, 0] = 
cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) @@ -195,12 +188,11 @@ def get_textregion_contours_in_org_image_light_old(cnts, img, slope_first): interpolation=cv2.INTER_NEAREST) cnts_org = [] for cnt in cnts: - img_copy = np.zeros(img.shape) - img_copy = cv2.fillPoly(img_copy, pts=[(cnt / zoom).astype(int)], color=(1, 1, 1)) + img_copy = np.zeros(img.shape[:2], dtype=np.uint8) + img_copy = cv2.fillPoly(img_copy, pts=[cnt // zoom], color=1) img_copy = rotation_image_new(img_copy, -slope_first).astype(np.uint8) - imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(img_copy, 0, 255, 0) cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) @@ -210,14 +202,13 @@ def get_textregion_contours_in_org_image_light_old(cnts, img, slope_first): return cnts_org def do_back_rotation_and_get_cnt_back(contour_par, index_r_con, img, slope_first, confidence_matrix): - img_copy = np.zeros(img.shape) - img_copy = cv2.fillPoly(img_copy, pts=[contour_par], color=(1, 1, 1)) - confidence_matrix_mapped_with_contour = confidence_matrix * img_copy[:,:,0] - confidence_contour = np.sum(confidence_matrix_mapped_with_contour) / float(np.sum(img_copy[:,:,0])) + img_copy = np.zeros(img.shape[:2], dtype=np.uint8) + img_copy = cv2.fillPoly(img_copy, pts=[contour_par], color=1) + confidence_matrix_mapped_with_contour = confidence_matrix * img_copy + confidence_contour = np.sum(confidence_matrix_mapped_with_contour) / float(np.sum(img_copy)) img_copy = rotation_image_new(img_copy, -slope_first).astype(np.uint8) - imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(img_copy, 0, 255, 0) cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) if len(cont_int)==0: @@ -245,14 +236,11 @@ def 
get_textregion_contours_in_org_image_light(cnts, img, confidence_matrix): def return_contours_of_interested_textline(region_pre_p, label): # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: + if region_pre_p.ndim == 3: cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: cnts_images = (region_pre_p[:, :] == label) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) @@ -262,25 +250,22 @@ def return_contours_of_interested_textline(region_pre_p, label): def return_contours_of_image(image): if len(image.shape) == 2: - image = np.repeat(image[:, :, np.newaxis], 3, axis=2) image = image.astype(np.uint8) + imgray = image else: image = image.astype(np.uint8) - imgray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + imgray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) return contours, hierarchy def return_contours_of_interested_region_by_min_size(region_pre_p, label, min_size=0.00003): # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: + if region_pre_p.ndim == 3: cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: cnts_images = (region_pre_p[:, :] == label) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) contours_imgs, 
hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) @@ -291,24 +276,21 @@ def return_contours_of_interested_region_by_min_size(region_pre_p, label, min_si def return_contours_of_interested_region_by_size(region_pre_p, label, min_area, max_area): # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: + if region_pre_p.ndim == 3: cnts_images = (region_pre_p[:, :, 0] == label) * 1 else: cnts_images = (region_pre_p[:, :] == label) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) contours_imgs = filter_contours_area_of_image_tables( thresh, contours_imgs, hierarchy, max_area=max_area, min_area=min_area) - img_ret = np.zeros((region_pre_p.shape[0], region_pre_p.shape[1], 3)) - img_ret = cv2.fillPoly(img_ret, pts=contours_imgs, color=(1, 1, 1)) + img_ret = np.zeros((region_pre_p.shape[0], region_pre_p.shape[1])) + img_ret = cv2.fillPoly(img_ret, pts=contours_imgs, color=1) - return img_ret[:, :, 0] + return img_ret def dilate_textline_contours(all_found_textline_polygons): return [[polygon2contour(contour2polygon(contour, dilate=6)) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index d41dda1..b8c7f3d 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -142,13 +142,12 @@ def dedup_separate_lines(img_patch, contour_text_interest, thetha, axis): rotation_matrix) def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): - (h, w) = img_patch.shape[:2] + h, w = 
img_patch.shape[:2] center = (w // 2, h // 2) M = cv2.getRotationMatrix2D(center, -thetha, 1.0) x_d = M[0, 2] y_d = M[1, 2] - thetha = thetha / 180. * np.pi - rotation_matrix = np.array([[np.cos(thetha), -np.sin(thetha)], [np.sin(thetha), np.cos(thetha)]]) + rotation_matrix = M[:2, :2] contour_text_interest_copy = contour_text_interest.copy() x_cont = contour_text_interest[:, 0, 0] @@ -1302,19 +1301,16 @@ def separate_lines_new_inside_tiles(img_path, thetha): def separate_lines_vertical_cont(img_patch, contour_text_interest, thetha, box_ind, add_boxes_coor_into_textlines): kernel = np.ones((5, 5), np.uint8) - pixel = 255 + label = 255 min_area = 0 max_area = 1 - if len(img_patch.shape) == 3: - cnts_images = (img_patch[:, :, 0] == pixel) * 1 + if img_patch.ndim == 3: + cnts_images = (img_patch[:, :, 0] == label) * 1 else: - cnts_images = (img_patch[:, :] == pixel) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnts_images = (img_patch[:, :] == label) * 1 + _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) + contours_imgs, hierarchy = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) contours_imgs = return_parent_contours(contours_imgs, hierarchy) contours_imgs = filter_contours_area_of_image_tables(thresh, @@ -1322,14 +1318,12 @@ def separate_lines_vertical_cont(img_patch, contour_text_interest, thetha, box_i max_area=max_area, min_area=min_area) cont_final = [] for i in range(len(contours_imgs)): - img_contour = np.zeros((cnts_images.shape[0], cnts_images.shape[1], 3)) - img_contour = cv2.fillPoly(img_contour, pts=[contours_imgs[i]], color=(255, 255, 255)) - img_contour = img_contour.astype(np.uint8) + img_contour = np.zeros(cnts_images.shape[:2], 
dtype=np.uint8) + img_contour = cv2.fillPoly(img_contour, pts=[contours_imgs[i]], color=255) img_contour = cv2.dilate(img_contour, kernel, iterations=4) - imgrayrot = cv2.cvtColor(img_contour, cv2.COLOR_BGR2GRAY) - _, threshrot = cv2.threshold(imgrayrot, 0, 255, 0) - contours_text_rot, _ = cv2.findContours(threshrot.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + _, threshrot = cv2.threshold(img_contour, 0, 255, 0) + contours_text_rot, _ = cv2.findContours(threshrot.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) ##contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[ ##0] @@ -1344,8 +1338,7 @@ def separate_lines_vertical_cont(img_patch, contour_text_interest, thetha, box_i def textline_contours_postprocessing(textline_mask, slope, contour_text_interest, box_ind, add_boxes_coor_into_textlines=False): - textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) * 255 - textline_mask = textline_mask.astype(np.uint8) + textline_mask = textline_mask * 255 kernel = np.ones((5, 5), np.uint8) textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_OPEN, kernel) textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_CLOSE, kernel) @@ -1356,12 +1349,11 @@ def textline_contours_postprocessing(textline_mask, slope, y_help = 2 textline_mask_help = np.zeros((textline_mask.shape[0] + int(2 * y_help), - textline_mask.shape[1] + int(2 * x_help), 3)) + textline_mask.shape[1] + int(2 * x_help))) textline_mask_help[y_help : y_help + textline_mask.shape[0], - x_help : x_help + textline_mask.shape[1], :] = np.copy(textline_mask[:, :, :]) + x_help : x_help + textline_mask.shape[1]] = np.copy(textline_mask[:, :]) dst = rotate_image(textline_mask_help, slope) - dst = dst[:, :, 0] dst[dst != 0] = 1 # if np.abs(slope)>.5 and textline_mask.shape[0]/float(textline_mask.shape[1])>3: @@ -1372,21 +1364,18 @@ def textline_contours_postprocessing(textline_mask, slope, contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[0] contour_text_copy[:, 
0, 1] = contour_text_copy[:, 0, 1] - box_ind[1] - img_contour = np.zeros((box_ind[3], box_ind[2], 3)) - img_contour = cv2.fillPoly(img_contour, pts=[contour_text_copy], color=(255, 255, 255)) + img_contour = np.zeros((box_ind[3], box_ind[2])) + img_contour = cv2.fillPoly(img_contour, pts=[contour_text_copy], color=255) img_contour_help = np.zeros((img_contour.shape[0] + int(2 * y_help), - img_contour.shape[1] + int(2 * x_help), 3)) + img_contour.shape[1] + int(2 * x_help))) img_contour_help[y_help : y_help + img_contour.shape[0], - x_help : x_help + img_contour.shape[1], :] = np.copy(img_contour[:, :, :]) + x_help : x_help + img_contour.shape[1]] = np.copy(img_contour[:, :]) img_contour_rot = rotate_image(img_contour_help, slope) - img_contour_rot = img_contour_rot.astype(np.uint8) - # dst_help = dst_help.astype(np.uint8) - imgrayrot = cv2.cvtColor(img_contour_rot, cv2.COLOR_BGR2GRAY) - _, threshrot = cv2.threshold(imgrayrot, 0, 255, 0) - contours_text_rot, _ = cv2.findContours(threshrot.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + _, threshrot = cv2.threshold(img_contour_rot, 0, 255, 0) + contours_text_rot, _ = cv2.findContours(threshrot.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) len_con_text_rot = [len(contours_text_rot[ib]) for ib in range(len(contours_text_rot))] ind_big_con = np.argmax(len_con_text_rot) From 6e57ab3741f5532a30dd2925b423cd40871ab010 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 16:53:59 +0200 Subject: [PATCH 18/44] textline_contours_postprocessing: do not catch arbitrary exceptions --- src/eynollah/utils/separate_lines.py | 68 ++++++++++++++-------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index b8c7f3d..3bfc903 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1344,51 +1344,49 @@ def textline_contours_postprocessing(textline_mask, slope, textline_mask 
= cv2.morphologyEx(textline_mask, cv2.MORPH_CLOSE, kernel) textline_mask = cv2.erode(textline_mask, kernel, iterations=2) # textline_mask = cv2.erode(textline_mask, kernel, iterations=1) - try: - x_help = 30 - y_help = 2 - textline_mask_help = np.zeros((textline_mask.shape[0] + int(2 * y_help), - textline_mask.shape[1] + int(2 * x_help))) - textline_mask_help[y_help : y_help + textline_mask.shape[0], - x_help : x_help + textline_mask.shape[1]] = np.copy(textline_mask[:, :]) + x_help = 30 + y_help = 2 - dst = rotate_image(textline_mask_help, slope) - dst[dst != 0] = 1 + textline_mask_help = np.zeros((textline_mask.shape[0] + int(2 * y_help), + textline_mask.shape[1] + int(2 * x_help))) + textline_mask_help[y_help : y_help + textline_mask.shape[0], + x_help : x_help + textline_mask.shape[1]] = np.copy(textline_mask[:, :]) - # if np.abs(slope)>.5 and textline_mask.shape[0]/float(textline_mask.shape[1])>3: - # plt.imshow(dst) - # plt.show() + dst = rotate_image(textline_mask_help, slope) + dst[dst != 0] = 1 - contour_text_copy = contour_text_interest.copy() - contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[0] - contour_text_copy[:, 0, 1] = contour_text_copy[:, 0, 1] - box_ind[1] + # if np.abs(slope)>.5 and textline_mask.shape[0]/float(textline_mask.shape[1])>3: + # plt.imshow(dst) + # plt.show() - img_contour = np.zeros((box_ind[3], box_ind[2])) - img_contour = cv2.fillPoly(img_contour, pts=[contour_text_copy], color=255) + contour_text_copy = contour_text_interest.copy() + contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[0] + contour_text_copy[:, 0, 1] = contour_text_copy[:, 0, 1] - box_ind[1] - img_contour_help = np.zeros((img_contour.shape[0] + int(2 * y_help), - img_contour.shape[1] + int(2 * x_help))) - img_contour_help[y_help : y_help + img_contour.shape[0], - x_help : x_help + img_contour.shape[1]] = np.copy(img_contour[:, :]) + img_contour = np.zeros((box_ind[3], box_ind[2])) + img_contour = cv2.fillPoly(img_contour, 
pts=[contour_text_copy], color=255) - img_contour_rot = rotate_image(img_contour_help, slope) + img_contour_help = np.zeros((img_contour.shape[0] + int(2 * y_help), + img_contour.shape[1] + int(2 * x_help))) + img_contour_help[y_help : y_help + img_contour.shape[0], + x_help : x_help + img_contour.shape[1]] = np.copy(img_contour[:, :]) - _, threshrot = cv2.threshold(img_contour_rot, 0, 255, 0) - contours_text_rot, _ = cv2.findContours(threshrot.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + img_contour_rot = rotate_image(img_contour_help, slope) - len_con_text_rot = [len(contours_text_rot[ib]) for ib in range(len(contours_text_rot))] - ind_big_con = np.argmax(len_con_text_rot) + _, threshrot = cv2.threshold(img_contour_rot, 0, 255, 0) + contours_text_rot, _ = cv2.findContours(threshrot.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - if abs(slope) > 45: - _, contours_rotated_clean = separate_lines_vertical_cont( - textline_mask, contours_text_rot[ind_big_con], box_ind, slope, - add_boxes_coor_into_textlines=add_boxes_coor_into_textlines) - else: - _, contours_rotated_clean = separate_lines( - dst, contours_text_rot[ind_big_con], slope, x_help, y_help) - except: - contours_rotated_clean = [] + len_con_text_rot = [len(contours_text_rot[ib]) for ib in range(len(contours_text_rot))] + ind_big_con = np.argmax(len_con_text_rot) + + if abs(slope) > 45: + _, contours_rotated_clean = separate_lines_vertical_cont( + textline_mask, contours_text_rot[ind_big_con], box_ind, slope, + add_boxes_coor_into_textlines=add_boxes_coor_into_textlines) + else: + _, contours_rotated_clean = separate_lines( + dst, contours_text_rot[ind_big_con], slope, x_help, y_help) return contours_rotated_clean From 595ed02743afc3ab8359de5f6feb0ca680546599 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 17:24:50 +0200 Subject: [PATCH 19/44] run_single: simplify; allow running TrOCR in non-fl mode, too - refactor final `self.full_layout` conditional, removing 
copied code - allow running `self.ocr` and `self.tr` branch in both cases (non/fl) - when running TrOCR, use model / processor / device initialised during init (instead of ad-hoc loading) --- src/eynollah/eynollah.py | 277 ++++++++++++++++----------------------- 1 file changed, 112 insertions(+), 165 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 834ecf3..079cf8c 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -379,9 +379,14 @@ class Eynollah: self.model_reading_order = self.our_load_model(self.model_reading_order_dir) if self.ocr and self.tr: self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - #("microsoft/trocr-base-printed")#("microsoft/trocr-base-handwritten") - self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") + if torch.cuda.is_available(): + self.logger.info("Using GPU acceleration") + self.device = torch.device("cuda:0") + else: + self.logger.info("Using CPU processing") + self.device = torch.device("cpu") + #self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") + self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") elif self.ocr and not self.tr: model_ocr = load_model(self.model_ocr_dir , compile=False) @@ -4805,12 +4810,13 @@ class Eynollah: slopes_marginals, mid_point_of_page_width) #print(len(polygons_of_marginals), len(ordered_left_marginals), len(ordered_right_marginals), 'marginals ordred') + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( + contours_only_text_parent_d_ordered, index_by_text_par_con) + else: + contours_only_text_parent_d_ordered = None + if self.full_layout: - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( - 
contours_only_text_parent_d_ordered, index_by_text_par_con) - else: - contours_only_text_parent_d_ordered = None if self.light_version: fun = check_any_text_region_in_model_one_is_main_or_header_light else: @@ -4869,44 +4875,43 @@ class Eynollah: splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) + else: + contours_only_text_parent_h = [] + contours_only_text_parent_h_d_ordered = [] if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) t_order = time.time() - if self.full_layout: - self.logger.info("Step 4/5: Reading Order Detection") - - if self.reading_order_machine_based: - self.logger.info("Using machine-based detection") - if self.right2left: - self.logger.info("Right-to-left mode enabled") - if self.headers_off: - self.logger.info("Headers ignored in reading order") + #if self.full_layout: + self.logger.info("Step 4/5: Reading Order Detection") - if self.reading_order_machine_based: - tror = time.time() - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( - contours_only_text_parent, contours_only_text_parent_h, text_regions_p) + if self.reading_order_machine_based: + self.logger.info("Using machine-based detection") + if self.right2left: + self.logger.info("Right-to-left mode enabled") + if self.headers_off: + self.logger.info("Headers ignored in reading order") + + if self.reading_order_machine_based: + order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( + contours_only_text_parent, contours_only_text_parent_h, text_regions_p) + else: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + order_text_new, id_of_texts_tot = self.do_order_of_regions( + contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent, 
contours_only_text_parent_h, boxes, textline_mask_tot) - else: - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, - boxes_d, textline_mask_tot_d) - self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") + order_text_new, id_of_texts_tot = self.do_order_of_regions( + contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, + boxes_d, textline_mask_tot_d) + self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") - if self.ocr and not self.tr: - self.logger.info("Step 4.5/5: OCR Processing") - - if torch.cuda.is_available(): - self.logger.info("Using GPU acceleration") - else: - self.logger.info("Using CPU processing") - + if self.ocr: + self.logger.info("Step 4.5/5: OCR Processing") + + if not self.tr: gc.collect() + if len(all_found_textline_polygons)>0: ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons, self.prediction_model, @@ -4941,15 +4946,68 @@ class Eynollah: self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_drop = None + else: - ocr_all_textlines = None - ocr_all_textlines_marginals_left = None - ocr_all_textlines_marginals_right = None - ocr_all_textlines_h = None - ocr_all_textlines_drop = None + if self.light_version: + self.logger.info("Using light version OCR") + if self.textline_light: + self.logger.info("Using light text line detection for OCR") + self.logger.info("Processing text lines...") + + self.device.reset() + gc.collect() + + torch.cuda.empty_cache() + self.model_ocr.to(self.device) + + ind_tot = 0 + #cv2.imwrite('./img_out.png', image_page) + ocr_all_textlines = [] + for indexing, ind_poly_first in enumerate(all_found_textline_polygons): + ocr_textline_in_textregion = [] + for indexing2, ind_poly in enumerate(ind_poly_first): + if not (self.textline_light or self.curved_line): + ind_poly = 
copy.deepcopy(ind_poly) + box_ind = all_box_coord[indexing] + #print(ind_poly,np.shape(ind_poly), 'ind_poly') + #print(box_ind) + ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) + #print(ind_poly_copy) + ind_poly[ind_poly<0] = 0 + x, y, w, h = cv2.boundingRect(ind_poly) + #print(ind_poly_copy, np.shape(ind_poly_copy)) + #print(x, y, w, h, h/float(w),'ratio') + h2w_ratio = h/float(w) + mask_poly = np.zeros(image_page.shape) + if not self.light_version: + img_poly_on_img = np.copy(image_page) + else: + img_poly_on_img = np.copy(img_bin_light) + mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) + + if self.textline_light: + mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1) + img_poly_on_img[:,:,0][mask_poly[:,:,0] ==0] = 255 + img_poly_on_img[:,:,1][mask_poly[:,:,0] ==0] = 255 + img_poly_on_img[:,:,2][mask_poly[:,:,0] ==0] = 255 + + img_croped = img_poly_on_img[y:y+h, x:x+w, :] + #cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) + text_ocr = self.return_ocr_of_textline_without_common_section( + img_croped, self.model_ocr, self.processor, self.device, w, h2w_ratio, ind_tot) + ocr_textline_in_textregion.append(text_ocr) + ind_tot = ind_tot +1 + ocr_all_textlines.append(ocr_textline_in_textregion) + else: + ocr_all_textlines = None + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None + ocr_all_textlines_h = None + ocr_all_textlines_drop = None - self.logger.info("Step 5/5: Output Generation") - + self.logger.info("Step 5/5: Output Generation") + + if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, @@ -4962,129 +5020,18 @@ class Eynollah: ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, ocr_all_textlines_drop, conf_contours_textregions, 
conf_contours_textregions_h) - - return pcgts - - contours_only_text_parent_h = [] - self.logger.info("Step 4/5: Reading Order Detection") - - if self.reading_order_machine_based: - self.logger.info("Using machine-based detection") - if self.right2left: - self.logger.info("Right-to-left mode enabled") - if self.headers_off: - self.logger.info("Headers ignored in reading order") - - if self.reading_order_machine_based: - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( - contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) - else: - contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d_ordered, index_by_text_par_con) - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - - if self.ocr and self.tr: - self.logger.info("Step 4.5/5: OCR Processing") - if torch.cuda.is_available(): - self.logger.info("Using GPU acceleration") - else: - self.logger.info("Using CPU processing") - if self.light_version: - self.logger.info("Using light version OCR") - if self.textline_light: - self.logger.info("Using light text line detection for OCR") - self.logger.info("Processing text lines...") + pcgts = self.writer.build_pagexml_no_full_layout( + txt_con_org, page_coord, order_text_new, id_of_texts_tot, + all_found_textline_polygons, all_box_coord, polygons_of_images, + polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, contours_tables, 
ocr_all_textlines, + ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, + conf_contours_textregions) - device = cuda.get_current_device() - device.reset() - gc.collect() - model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") - torch.cuda.empty_cache() - model_ocr.to(device) - - ind_tot = 0 - #cv2.imwrite('./img_out.png', image_page) - ocr_all_textlines = [] - for indexing, ind_poly_first in enumerate(all_found_textline_polygons): - ocr_textline_in_textregion = [] - for indexing2, ind_poly in enumerate(ind_poly_first): - if not (self.textline_light or self.curved_line): - ind_poly = copy.deepcopy(ind_poly) - box_ind = all_box_coord[indexing] - #print(ind_poly,np.shape(ind_poly), 'ind_poly') - #print(box_ind) - ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) - #print(ind_poly_copy) - ind_poly[ind_poly<0] = 0 - x, y, w, h = cv2.boundingRect(ind_poly) - #print(ind_poly_copy, np.shape(ind_poly_copy)) - #print(x, y, w, h, h/float(w),'ratio') - h2w_ratio = h/float(w) - mask_poly = np.zeros(image_page.shape) - if not self.light_version: - img_poly_on_img = np.copy(image_page) - else: - img_poly_on_img = np.copy(img_bin_light) - mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) - - if self.textline_light: - mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1) - img_poly_on_img[:,:,0][mask_poly[:,:,0] ==0] = 255 - img_poly_on_img[:,:,1][mask_poly[:,:,0] ==0] = 255 - img_poly_on_img[:,:,2][mask_poly[:,:,0] ==0] = 255 - - img_croped = img_poly_on_img[y:y+h, x:x+w, :] - #cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) - text_ocr = self.return_ocr_of_textline_without_common_section( - img_croped, model_ocr, processor, device, w, h2w_ratio, ind_tot) - ocr_textline_in_textregion.append(text_ocr) - ind_tot = ind_tot +1 - 
ocr_all_textlines.append(ocr_textline_in_textregion) - - elif self.ocr and not self.tr: - gc.collect() - if len(all_found_textline_polygons)>0: - ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - - if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: - ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_left, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - - if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: - ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_right, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - - else: - ocr_all_textlines = None - ocr_all_textlines_marginals_left = None - ocr_all_textlines_marginals_right = None - self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") - - self.logger.info("Step 5/5: Output Generation") - self.logger.info("Generating PAGE-XML output") - - pcgts = self.writer.build_pagexml_no_full_layout( - txt_con_org, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, all_box_coord, polygons_of_images, - polygons_of_marginals_left, polygons_of_marginals_right, - all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, - all_box_coord_marginals_left, all_box_coord_marginals_right, - slopes, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, contours_tables, ocr_all_textlines, - ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, - conf_contours_textregions) - return pcgts From 
a1904fa660e7cb79ba9b4d8fc7df5befc41072f1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 17:44:12 +0200 Subject: [PATCH 20/44] tests: cover layout with OCR in various modes --- tests/test_run.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_run.py b/tests/test_run.py index 59e5099..d69f021 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -24,14 +24,18 @@ MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021- "options", [ [], # defaults - ["--allow_scaling", "--curved-line"], + #["--allow_scaling", "--curved-line"], ["--allow_scaling", "--curved-line", "--full-layout"], ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"], ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based", "--textline_light", "--light_version"], # -ep ... # -eoi ... - # --do_ocr + ["--do_ocr"], + ["--do_ocr", "--light_version", "--textline_light"], + ["--do_ocr", "--transformer_ocr"], + #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"], + ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"], # --skip_layout_and_reading_order ], ids=str) def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options): From 23535998f7532942d481f3729682969e19c228b6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 Oct 2025 21:27:21 +0200 Subject: [PATCH 21/44] tests: symlink OCR models into layout model directory (so layout with OCR options works with our split model packages) --- Makefile | 19 +++++++++++-------- tests/test_run.py | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 357aa47..5d190b2 100644 --- a/Makefile +++ b/Makefile @@ -90,26 +90,29 @@ deps-test: $(OCR_MODELNAME) endif deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME) $(PIP) install -r requirements-test.txt +ifeq (OCR,$(findstring OCR, $(EXTRAS))) + ln -s 
$(OCR_MODELNAME)/* $(SEG_MODELNAME)/ +endif smoke-test: TMPDIR != mktemp -d smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif # layout analysis: - eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_layout_v0_5_0 + eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/$(SEG_MODELNAME) fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $( Date: Tue, 7 Oct 2025 00:54:25 +0200 Subject: [PATCH 22/44] CI: run deps-test with OCR extra so symlink rule fires --- .github/workflows/test-eynollah.yml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 9d5b2c8..7c3f5ae 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -65,7 +65,7 @@ jobs: run: | python -m pip install --upgrade pip make install-dev EXTRAS=OCR,plotting - make deps-test + make deps-test EXTRAS=OCR,plotting - name: Test with pytest run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml" - name: Get coverage results diff --git a/Makefile b/Makefile index 5d190b2..618b1f9 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ endif deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME) $(PIP) install -r requirements-test.txt ifeq (OCR,$(findstring OCR, $(EXTRAS))) - ln -s $(OCR_MODELNAME)/* $(SEG_MODELNAME)/ + ln -rs $(OCR_MODELNAME)/* $(SEG_MODELNAME)/ endif smoke-test: TMPDIR != mktemp -d From d53f829dfd0b26e4738915b24ffe4256796c6eb4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:06:57 +0200 Subject: [PATCH 23/44] filter_contours_inside_a_bigger_one: fix edge case in 81827c29 --- src/eynollah/eynollah.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 079cf8c..271779f 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4068,7 +4068,9 @@ class Eynollah: for textregion_index_to_del in 
textline_in_textregion_index_to_del: contours[textregion_index_to_del] = list(np.delete( contours[textregion_index_to_del], - textline_in_textregion_index_to_del[textregion_index_to_del])) + textline_in_textregion_index_to_del[textregion_index_to_del], + # needed so numpy does not flatten the entire result when 0 left + axis=0)) return contours From 2e907875c12b4f22c650c109558917479e0ec3ae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:32:06 +0200 Subject: [PATCH 24/44] get_text_region_boxes_by_given_contours: simplify --- src/eynollah/eynollah.py | 4 ++-- src/eynollah/utils/contour.py | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 271779f..06be910 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4726,8 +4726,8 @@ class Eynollah: txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) #print("text region early 4 in %.1fs", time.time() - t0) - boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) - boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) + boxes_text = get_text_region_boxes_by_given_contours(contours_only_text_parent) + boxes_marginals = get_text_region_boxes_by_given_contours(polygons_of_marginals) #print("text region early 5 in %.1fs", time.time() - t0) ## birdan sora chock chakir if not self.curved_line: diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 22a6f50..fb4bbd0 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -36,14 +36,8 @@ def find_contours_mean_y_diff(contours_main): return np.mean(np.diff(np.sort(np.array(cy_main)))) def get_text_region_boxes_by_given_contours(contours): - boxes = [] - contours_new = [] - for jj in range(len(contours)): - box = cv2.boundingRect(contours[jj]) - 
boxes.append(box) - contours_new.append(contours[jj]) - - return boxes, contours_new + return [cv2.boundingRect(contour) + for contour in contours] def filter_contours_area_of_image(image, contours, hierarchy, max_area=1.0, min_area=0.0, dilate=0): found_polygons_early = [] From dfdc70537530b55f77b5232ae3cfa1fc8357eed0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:33:06 +0200 Subject: [PATCH 25/44] do_work_of_slopes: rm unused old variant --- src/eynollah/eynollah.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 06be910..2431a3b 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -108,7 +108,6 @@ from .utils.utils_ocr import ( get_contours_and_bounding_boxes ) from .utils.separate_lines import ( - textline_contours_postprocessing, separate_lines_new2, return_deskew_slop, do_work_of_slopes_new, @@ -2062,43 +2061,6 @@ class Eynollah: (prediction_textline_longshot_true_size[:, :, 0]==1).astype(np.uint8)) - def do_work_of_slopes(self, q, poly, box_sub, boxes_per_process, textline_mask_tot, contours_per_process): - self.logger.debug('enter do_work_of_slopes') - slope_biggest = 0 - slopes_sub = [] - boxes_sub_new = [] - poly_sub = [] - for mv in range(len(boxes_per_process)): - crop_img, _ = crop_image_inside_box(boxes_per_process[mv], textline_mask_tot) - crop_img = cv2.erode(crop_img, KERNEL, iterations=2) - try: - textline_con, hierarchy = return_contours_of_image(crop_img) - textline_con_fil = filter_contours_area_of_image(crop_img, textline_con, hierarchy, - max_area=1, min_area=0.0008) - y_diff_mean = find_contours_mean_y_diff(textline_con_fil) - sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) - crop_img[crop_img > 0] = 1 - slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, - logger=self.logger, plotter=self.plotter) - except Exception as why: - self.logger.error(why) - 
slope_corresponding_textregion = MAX_SLOPE - - if slope_corresponding_textregion == MAX_SLOPE: - slope_corresponding_textregion = slope_biggest - slopes_sub.append(slope_corresponding_textregion) - - cnt_clean_rot = textline_contours_postprocessing( - crop_img, slope_corresponding_textregion, contours_per_process[mv], boxes_per_process[mv]) - - poly_sub.append(cnt_clean_rot) - boxes_sub_new.append(boxes_per_process[mv]) - - q.put(slopes_sub) - poly.put(poly_sub) - box_sub.put(boxes_sub_new) - self.logger.debug('exit do_work_of_slopes') - def get_regions_light_v_extract_only_images(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_extract_images_only") erosion_hurts = False From 0a80cd5dffc7e5c28f41330da8d2f1255ac66e88 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:37:05 +0200 Subject: [PATCH 26/44] avoid unnecessary 3-channel conversions: for tables, too --- src/eynollah/eynollah.py | 155 ++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 90 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 2431a3b..70a8a17 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -930,10 +930,8 @@ class Eynollah: img_w = img.shape[1] prediction_true = np.zeros((img_h, img_w, 3)) mask_true = np.zeros((img_h, img_w)) - nxf = img_w / float(width_mid) - nyf = img_h / float(height_mid) - nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) - nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + nxf = math.ceil(img_w / float(width_mid)) + nyf = math.ceil(img_h / float(height_mid)) list_i_s = [] list_j_s = [] @@ -946,18 +944,10 @@ class Eynollah: img_patch = np.zeros((n_batch_inference, img_height_model, img_width_model, 3)) for i in range(nxf): for j in range(nyf): - if i == 0: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - else: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - if j == 0: - index_y_d = j * 
height_mid - index_y_u = index_y_d + img_height_model - else: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model if index_x_u > img_w: index_x_u = img_w index_x_d = img_w - img_width_model @@ -2600,23 +2590,20 @@ class Eynollah: self, layout, table_prediction_early, pixel_table, num_col_classifier): layout_org = np.copy(layout) - layout_org[:,:,0][layout_org[:,:,0]==pixel_table] = 0 - layout = (layout[:,:,0]==pixel_table)*1 - - layout = layout.astype(np.uint8) + layout_org[layout_org == pixel_table] = 0 + layout = (layout == pixel_table).astype(np.uint8) * 1 _, thresh = cv2.threshold(layout, 0, 255, 0) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cnt_size = np.array([cv2.contourArea(contours[j]) - for j in range(len(contours))]) + cnt_size = np.array([cv2.contourArea(cnt) for cnt in contours]) contours_new = [] - for i in range(len(contours)): - x, y, w, h = cv2.boundingRect(contours[i]) + for i, contour in enumerate(contours): + x, y, w, h = cv2.boundingRect(contour) iou = cnt_size[i] /float(w*h) *100 if iou<80: layout_contour = np.zeros(layout_org.shape[:2]) - layout_contour = cv2.fillPoly(layout_contour, pts=[contours[i]] ,color=1) + layout_contour = cv2.fillPoly(layout_contour, pts=[contour] ,color=1) layout_contour_sum = layout_contour.sum(axis=0) layout_contour_sum_diff = np.diff(layout_contour_sum) @@ -2648,26 +2635,26 @@ class Eynollah: #print(iou_in,'iou_in_in1') if iou_in>30: - layout_org= cv2.fillPoly(layout_org, pts=[contours_sep[ji]], color=3 * (pixel_table,)) + layout_org = cv2.fillPoly(layout_org, pts=[contours_sep[ji]], color=pixel_table) else: pass else: - layout_org= cv2.fillPoly(layout_org, pts=[contours_sep[ji]], color=3 * (pixel_table,)) + layout_org= cv2.fillPoly(layout_org, pts=[contours_sep[ji]], color=pixel_table) else: - 
contours_new.append(contours[i]) + contours_new.append(contour) if num_col_classifier>=2: - only_recent_contour_image = np.zeros((layout.shape[0],layout.shape[1])) - only_recent_contour_image= cv2.fillPoly(only_recent_contour_image,pts=[contours[i]] ,color=(1,1,1)) + only_recent_contour_image = np.zeros(layout.shape[:2]) + only_recent_contour_image = cv2.fillPoly(only_recent_contour_image, pts=[contour],color=1) table_pixels_masked_from_early_pre = only_recent_contour_image * table_prediction_early iou_in = 100. * table_pixels_masked_from_early_pre.sum() / only_recent_contour_image.sum() #print(iou_in,'iou_in') if iou_in>30: - layout_org= cv2.fillPoly(layout_org, pts=[contours[i]], color=3 * (pixel_table,)) + layout_org = cv2.fillPoly(layout_org, pts=[contour], color=pixel_table) else: pass else: - layout_org= cv2.fillPoly(layout_org, pts=[contours[i]], color=3 * (pixel_table,)) + layout_org = cv2.fillPoly(layout_org, pts=[contour], color=pixel_table) return layout_org, contours_new @@ -2714,16 +2701,10 @@ class Eynollah: pass boxes = np.array(boxes, dtype=int) # to be on the safe side - img_comm_e = np.zeros(image_revised_1.shape) - img_comm = np.repeat(img_comm_e[:, :, np.newaxis], 3, axis=2) - + img_comm = np.zeros(image_revised_1.shape, dtype=np.uint8) for indiv in np.unique(image_revised_1): - image_col=(image_revised_1==indiv)*255 - img_comm_in=np.repeat(image_col[:, :, np.newaxis], 3, axis=2) - img_comm_in=img_comm_in.astype(np.uint8) - - imgray = cv2.cvtColor(img_comm_in, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) + image_col = (image_revised_1 == indiv).astype(np.uint8) * 255 + _, thresh = cv2.threshold(image_col, 0, 255, 0) contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) if indiv==pixel_table: @@ -2733,35 +2714,27 @@ class Eynollah: main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area=1, min_area=min_area) - img_comm = cv2.fillPoly(img_comm, pts = 
main_contours, color = (indiv, indiv, indiv)) - img_comm = img_comm.astype(np.uint8) + img_comm = cv2.fillPoly(img_comm, pts=main_contours, color=indiv) if not self.isNaN(slope_mean_hor): - image_revised_last = np.zeros((image_regions_eraly_p.shape[0], image_regions_eraly_p.shape[1],3)) + image_revised_last = np.zeros(image_regions_eraly_p.shape[:2]) for i in range(len(boxes)): box_ys = slice(*boxes[i][2:4]) box_xs = slice(*boxes[i][0:2]) image_box = img_comm[box_ys, box_xs] try: - image_box_tabels_1=(image_box[:,:,0]==pixel_table)*1 + image_box_tabels_1 = (image_box == pixel_table) * 1 contours_tab,_=return_contours_of_image(image_box_tabels_1) contours_tab=filter_contours_area_of_image_tables(image_box_tabels_1,contours_tab,_,1,0.003) - image_box_tabels_1=(image_box[:,:,0]==pixel_line)*1 + image_box_tabels_1 = (image_box == pixel_line).astype(np.uint8) * 1 + image_box_tabels_and_m_text = ( (image_box == pixel_table) | + (image_box == 1) ).astype(np.uint8) * 1 - image_box_tabels_and_m_text=( (image_box[:,:,0]==pixel_table) | (image_box[:,:,0]==1) )*1 - image_box_tabels_and_m_text=image_box_tabels_and_m_text.astype(np.uint8) + image_box_tabels_1 = cv2.dilate(image_box_tabels_1, KERNEL, iterations=5) - image_box_tabels_1=image_box_tabels_1.astype(np.uint8) - image_box_tabels_1 = cv2.dilate(image_box_tabels_1,KERNEL,iterations = 5) - - contours_table_m_text,_=return_contours_of_image(image_box_tabels_and_m_text) - image_box_tabels=np.repeat(image_box_tabels_1[:, :, np.newaxis], 3, axis=2) - - image_box_tabels=image_box_tabels.astype(np.uint8) - imgray = cv2.cvtColor(image_box_tabels, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_line,hierachy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + contours_table_m_text, _ = return_contours_of_image(image_box_tabels_and_m_text) + _, thresh = cv2.threshold(image_box_tabels_1, 0, 255, 0) + contours_line, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 
y_min_main_line ,y_max_main_line=find_features_of_contours(contours_line) y_min_main_tab ,y_max_main_tab=find_features_of_contours(contours_tab) @@ -2793,18 +2766,20 @@ class Eynollah: y_max_main_tab[i_t] < y_min_main_line[i_l] and y_min_main_tab[i_t] < y_min_main_line[i_l]): pass - elif np.abs(y_max_main_line[i_l]-y_min_main_line[i_l])<100: + elif abs(y_max_main_line[i_l] - y_min_main_line[i_l]) < 100: pass else: - y_up_tab.append(np.min([y_min_main_line[i_l], y_min_main_tab[i_t] ]) ) - y_down_tab.append( np.max([ y_max_main_line[i_l],y_max_main_tab[i_t] ]) ) + y_up_tab.append(min([y_min_main_line[i_l], + y_min_main_tab[i_t]])) + y_down_tab.append(max([y_max_main_line[i_l], + y_max_main_tab[i_t]])) if len(y_up_tab)==0: y_up_tabs.append(y_min_main_tab[i_t]) y_down_tabs.append(y_max_main_tab[i_t]) else: - y_up_tabs.append(np.min(y_up_tab)) - y_down_tabs.append(np.max(y_down_tab)) + y_up_tabs.append(min(y_up_tab)) + y_down_tabs.append(max(y_down_tab)) else: y_down_tabs=[] y_up_tabs=[] @@ -2814,7 +2789,7 @@ class Eynollah: y_up_tabs=[] for ii in range(len(y_up_tabs)): - image_box[y_up_tabs[ii]:y_down_tabs[ii],:,0]=pixel_table + image_box[y_up_tabs[ii]:y_down_tabs[ii]] = pixel_table image_revised_last[box_ys, box_xs] = image_box else: @@ -2825,14 +2800,14 @@ class Eynollah: image_revised_last[box_ys, box_xs] = image_box if num_col_classifier==1: - img_tables_col_1 = (image_revised_last[:,:,0] == pixel_table).astype(np.uint8) + img_tables_col_1 = (image_revised_last == pixel_table).astype(np.uint8) contours_table_col1, _ = return_contours_of_image(img_tables_col_1) _,_ ,_ , _, y_min_tab_col1 ,y_max_tab_col1, _= find_new_features_of_contours(contours_table_col1) if len(y_min_tab_col1)>0: for ijv in range(len(y_min_tab_col1)): - image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv]),:,:]=pixel_table + image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = pixel_table return image_revised_last def get_tables_from_model(self, img, 
num_col_classifier): @@ -3200,7 +3175,7 @@ class Eynollah: pass else: text_regions_p_tables = np.copy(text_regions_p) - text_regions_p_tables[:,:][(table_prediction[:,:] == 1)] = 10 + text_regions_p_tables[(table_prediction == 1)] = 10 pixel_line = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, @@ -3221,8 +3196,8 @@ class Eynollah: pass else: text_regions_p_tables = np.copy(text_regions_p_1_n) - text_regions_p_tables =np.round(text_regions_p_tables) - text_regions_p_tables[:,:][(text_regions_p_tables[:,:] != 3) & (table_prediction_n[:,:] == 1)] = 10 + text_regions_p_tables = np.round(text_regions_p_tables) + text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 pixel_line = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( @@ -3242,21 +3217,21 @@ class Eynollah: if self.tables: if self.light_version: - text_regions_p[:,:][table_prediction[:,:]==1] = 10 - img_revised_tab=text_regions_p[:,:] + text_regions_p[table_prediction == 1] = 10 + img_revised_tab = text_regions_p[:,:] else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - img_revised_tab = np.copy(img_revised_tab2[:,:,0]) - img_revised_tab[:,:][(text_regions_p[:,:] == 1) & (img_revised_tab[:,:] != 10)] = 1 + img_revised_tab = np.copy(img_revised_tab2) + img_revised_tab[(text_regions_p == 1) & (img_revised_tab != 10)] = 1 else: - img_revised_tab = np.copy(text_regions_p[:,:]) - img_revised_tab[:,:][img_revised_tab[:,:] == 10] = 0 - img_revised_tab[:,:][img_revised_tab2_d_rotated[:,:,0] == 10] = 10 + img_revised_tab = np.copy(text_regions_p) + img_revised_tab[img_revised_tab == 10] = 0 + img_revised_tab[img_revised_tab2_d_rotated == 10] = 10 - text_regions_p[:,:][text_regions_p[:,:]==10] = 0 - text_regions_p[:,:][img_revised_tab[:,:]==10] = 10 + text_regions_p[text_regions_p == 10] = 0 + text_regions_p[img_revised_tab == 10] = 10 else: - img_revised_tab=text_regions_p[:,:] + 
img_revised_tab = text_regions_p[:,:] #img_revised_tab = text_regions_p[:, :] if self.light_version: polygons_of_images = return_contours_of_interested_region(text_regions_p, 2) @@ -3386,7 +3361,7 @@ class Eynollah: num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p_1_n) text_regions_p_tables = np.round(text_regions_p_tables) - text_regions_p_tables[:,:][(text_regions_p_tables[:,:]!=3) & (table_prediction_n[:,:]==1)] = 10 + text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 pixel_line = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( @@ -3405,17 +3380,17 @@ class Eynollah: text_regions_p.shape[1]) if np.abs(slope_deskew) < 0.13: - img_revised_tab = np.copy(img_revised_tab2[:,:,0]) + img_revised_tab = np.copy(img_revised_tab2) else: - img_revised_tab = np.copy(text_regions_p[:,:]) - img_revised_tab[:,:][img_revised_tab[:,:] == 10] = 0 - img_revised_tab[:,:][img_revised_tab2_d_rotated[:,:,0] == 10] = 10 + img_revised_tab = np.copy(text_regions_p) + img_revised_tab[img_revised_tab == 10] = 0 + img_revised_tab[img_revised_tab2_d_rotated == 10] = 10 - ##img_revised_tab=img_revised_tab2[:,:,0] - #img_revised_tab=text_regions_p[:,:] - text_regions_p[:,:][text_regions_p[:,:]==10] = 0 - text_regions_p[:,:][img_revised_tab[:,:]==10] = 10 - #img_revised_tab[img_revised_tab2[:,:,0]==10] =10 + ##img_revised_tab = img_revised_tab2[:,:] + #img_revised_tab = text_regions_p[:,:] + text_regions_p[text_regions_p == 10] = 0 + text_regions_p[img_revised_tab == 10] = 10 + #img_revised_tab[img_revised_tab2 == 10] = 10 pixel_img = 4 min_area_mar = 0.00001 From fd43e78442251c552faafeffe02256023ae1a806 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:42:36 +0200 Subject: [PATCH 27/44] filter_contours_without_textline_inside: simplify - np.delete in index array instead of contour lists - yield actual resulting indices --- src/eynollah/eynollah.py | 77 
++++------------------------------------ 1 file changed, 6 insertions(+), 71 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 70a8a17..6cc8b1b 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4040,79 +4040,23 @@ class Eynollah: self, contours, text_con_org, contours_textline, contours_only_text_parent_d_ordered, conf_contours_textregions): - ###contours_txtline_of_all_textregions = [] - ###for jj in range(len(contours_textline)): - ###contours_txtline_of_all_textregions = contours_txtline_of_all_textregions + contours_textline[jj] - ###M_main_textline = [cv2.moments(contours_txtline_of_all_textregions[j]) - ### for j in range(len(contours_txtline_of_all_textregions))] - ###cx_main_textline = [(M_main_textline[j]["m10"] / (M_main_textline[j]["m00"] + 1e-32)) - ### for j in range(len(M_main_textline))] - ###cy_main_textline = [(M_main_textline[j]["m01"] / (M_main_textline[j]["m00"] + 1e-32)) - ### for j in range(len(M_main_textline))] - - ###M_main = [cv2.moments(contours[j]) for j in range(len(contours))] - ###cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - ###cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - - ###contours_with_textline = [] - ###for ind_tr, con_tr in enumerate(contours): - ###results = [cv2.pointPolygonTest(con_tr, - ### (cx_main_textline[index_textline_con], - ### cy_main_textline[index_textline_con]), - ### False) - ### for index_textline_con in range(len(contours_txtline_of_all_textregions)) ] - ###results = np.array(results) - ###if np.any(results==1): - ###contours_with_textline.append(con_tr) - - textregion_index_to_del = set() - for index_textregion, textlines_textregion in enumerate(contours_textline): - if len(textlines_textregion) == 0: - textregion_index_to_del.add(index_textregion) + assert len(contours_par) == len(contours_textline) + indices = np.arange(len(contours_textline)) + indices = 
np.delete(indices, np.flatnonzero([len(lines) == 0 for lines in contours_textline])) def filterfun(lis): if len(lis) == 0: return [] - if len(textregion_index_to_del) == 0: - return lis - return list(np.delete(lis, list(textregion_index_to_del))) + return list(np.array(lis)[indices]) return (filterfun(contours), filterfun(text_con_org), filterfun(conf_contours_textregions), filterfun(contours_textline), filterfun(contours_only_text_parent_d_ordered), - np.arange(len(contours) - len(textregion_index_to_del))) + indices + ) - def delete_regions_without_textlines( - self, slopes, all_found_textline_polygons, boxes_text, txt_con_org, - contours_only_text_parent, index_by_text_par_con): - - slopes_rem = [] - all_found_textline_polygons_rem = [] - boxes_text_rem = [] - txt_con_org_rem = [] - contours_only_text_parent_rem = [] - index_by_text_par_con_rem = [] - - for i, ind_con in enumerate(all_found_textline_polygons): - if len(ind_con): - all_found_textline_polygons_rem.append(ind_con) - slopes_rem.append(slopes[i]) - boxes_text_rem.append(boxes_text[i]) - txt_con_org_rem.append(txt_con_org[i]) - contours_only_text_parent_rem.append(contours_only_text_parent[i]) - index_by_text_par_con_rem.append(index_by_text_par_con[i]) - - index_sort = np.argsort(index_by_text_par_con_rem) - indexes_new = np.array(range(len(index_by_text_par_con_rem))) - - index_by_text_par_con_rem_sort = [indexes_new[index_sort==j][0] - for j in range(len(index_by_text_par_con_rem))] - - return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, - contours_only_text_parent_rem, index_by_text_par_con_rem_sort) - def separate_marginals_to_left_and_right_and_order_from_top_to_down( self, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width): @@ -4679,15 +4623,6 @@ class Eynollah: polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, boxes_marginals, slope_deskew) - #slopes, 
all_found_textline_polygons, boxes_text, txt_con_org, \ - # contours_only_text_parent, index_by_text_par_con = \ - # self.delete_regions_without_textlines(slopes, all_found_textline_polygons, - # boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con) - #slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, \ - # polygons_of_marginals, polygons_of_marginals, _ = \ - # self.delete_regions_without_textlines(slopes_marginals, all_found_textline_polygons_marginals, - # boxes_marginals, polygons_of_marginals, polygons_of_marginals, - # np.array(range(len(polygons_of_marginals)))) all_found_textline_polygons = dilate_textline_contours( all_found_textline_polygons) all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( From 02a347a48a972de49c4b098f454a9a16cc4ee4fc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:47:34 +0200 Subject: [PATCH 28/44] no more need to rm from `contours_only_text_parent_d_ordered` now --- src/eynollah/eynollah.py | 16 ++-------------- src/eynollah/utils/__init__.py | 8 ++++---- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6cc8b1b..c4a6600 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4437,6 +4437,8 @@ class Eynollah: ###min_con_area = 0.000005 contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + contours_only_text_parent_d_ordered = [] + contours_only_text_parent_d = [] if len(contours_only_text_parent) > 0: areas_tot_text = np.prod(text_only.shape) areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) @@ -4558,15 +4560,6 @@ class Eynollah: # plt.subplot(2, 2, 2, title="result contours") # plt.imshow(img4) # plt.show() - else: - contours_only_text_parent_d_ordered = [] - contours_only_text_parent_d = [] - contours_only_text_parent = [] 
- - else: - contours_only_text_parent_d_ordered = [] - contours_only_text_parent_d = [] - #contours_only_text_parent = [] if not len(contours_only_text_parent): # stop early @@ -4684,11 +4677,6 @@ class Eynollah: slopes_marginals, mid_point_of_page_width) #print(len(polygons_of_marginals), len(ordered_left_marginals), len(ordered_right_marginals), 'marginals ordred') - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d_ordered, index_by_text_par_con) - else: - contours_only_text_parent_d_ordered = None if self.full_layout: if self.light_version: diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index ebf78fe..5ccb2af 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -938,7 +938,7 @@ def check_any_text_region_in_model_one_is_main_or_header( if (pixels_header>=pixels_main) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ]=2 contours_only_text_parent_head.append(con) - if contours_only_text_parent_d_ordered is not None: + if len(contours_only_text_parent_d_ordered): contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_head.append(all_box_coord[ii]) slopes_head.append(slopes[ii]) @@ -948,7 +948,7 @@ def check_any_text_region_in_model_one_is_main_or_header( regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ]=1 contours_only_text_parent_main.append(con) conf_contours_main.append(conf_contours[ii]) - if contours_only_text_parent_d_ordered is not None: + if len(contours_only_text_parent_d_ordered): contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_main.append(all_box_coord[ii]) slopes_main.append(slopes[ii]) @@ -1033,7 +1033,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( regions_model_1[:,:][(regions_model_1[:,:]==1) 
& (img == 255) ] = 2 contours_only_text_parent_head.append(contours_only_text_parent[ii]) conf_contours_head.append(None) # why not conf_contours[ii], too? - if contours_only_text_parent_d_ordered is not None: + if len(contours_only_text_parent_d_ordered): contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_head.append(all_box_coord[ii]) slopes_head.append(slopes[ii]) @@ -1043,7 +1043,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( regions_model_1[:,:][(regions_model_1[:,:]==1) & (img == 255) ] = 1 contours_only_text_parent_main.append(contours_only_text_parent[ii]) conf_contours_main.append(conf_contours[ii]) - if contours_only_text_parent_d_ordered is not None: + if len(contours_only_text_parent_d_ordered): contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_main.append(all_box_coord[ii]) slopes_main.append(slopes[ii]) From d88ca18eec8f1a4def371848c218b817fdb728a1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 22:53:30 +0200 Subject: [PATCH 29/44] get/do_work_of_slopes etc.: reduce call/return signatures - `get_textregion_contours_in_org_image_light`: no more need to also return unchanged contours here (see 41cc38c5); therefore - `txt_con_org`: no more need for this (now mere alias to `contours_only_text_parent`); also - `index_by_text_par_con`: no more need for this (see prev. 
commit), so do not pass/return - `get_slopes_and_deskew_*`: do not pass `contours_only_text` (where not used) - `get_slopes_and_deskew_*`: do not return unchanged contours, boxes - `do_work_of_slopes_*`: adapt respectively --- src/eynollah/eynollah.py | 98 +++++++++++++--------------- src/eynollah/utils/contour.py | 4 +- src/eynollah/utils/separate_lines.py | 12 ++-- 3 files changed, 54 insertions(+), 60 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index c4a6600..ec68bcd 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -879,7 +879,7 @@ class Eynollah: thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): - self.logger.debug("enter do_prediction") + self.logger.debug("enter do_prediction (patches=%d)", patches) img_height_model = model.layers[-1].output_shape[1] img_width_model = model.layers[-1].output_shape[2] @@ -1856,7 +1856,7 @@ class Eynollah: return sorted_textlines - def get_slopes_and_deskew_new_light2(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): + def get_slopes_and_deskew_new_light2(self, contours_par, textline_mask_tot, boxes, slope_deskew): polygons_of_textlines = return_contours_of_interested_region(textline_mask_tot,1,0.00001) cx_main_tot, cy_main_tot = find_center_of_contours(polygons_of_textlines) @@ -1889,16 +1889,12 @@ class Eynollah: all_box_coord.append(crop_coor) return (all_found_textline_polygons, - boxes, - contours, - contours_par, all_box_coord, - np.array(range(len(contours_par))), slopes) def get_slopes_and_deskew_new_light(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): if not len(contours): - return [], [], [], [], [], [], [] + return [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_light") with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: results = self.executor.map(partial(do_work_of_slopes_new_light, @@ -1906,15 +1902,15 @@ class Eynollah: slope_deskew=slope_deskew, 
textline_light=self.textline_light, logger=self.logger,), - boxes, contours, contours_par, range(len(contours_par))) + boxes, contours, contours_par) results = list(results) # exhaust prior to release - #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) + #textline_polygons, box_coord, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new_light") return tuple(zip(*results)) def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, boxes, slope_deskew): if not len(contours): - return [], [], [], [], [], [], [] + return [], [], [] self.logger.debug("enter get_slopes_and_deskew_new") with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: results = self.executor.map(partial(do_work_of_slopes_new, @@ -1924,16 +1920,16 @@ class Eynollah: KERNEL=KERNEL, logger=self.logger, plotter=self.plotter,), - boxes, contours, contours_par, range(len(contours_par))) + boxes, contours, contours_par) results = list(results) # exhaust prior to release - #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) + #textline_polygons, box_coord, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new") return tuple(zip(*results)) - def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, boxes, + def get_slopes_and_deskew_new_curved(self, contours_par, textline_mask_tot, boxes, mask_texts_only, num_col, scale_par, slope_deskew): - if not len(contours): - return [], [], [], [], [], [], [] + if not len(contours_par): + return [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_curved") with share_ndarray(textline_mask_tot) as textline_mask_tot_shared: with share_ndarray(mask_texts_only) as mask_texts_only_shared: @@ -1947,9 +1943,9 @@ class Eynollah: KERNEL=KERNEL, logger=self.logger, plotter=self.plotter,), - boxes, contours, contours_par, range(len(contours_par))) + boxes, 
contours_par) results = list(results) # exhaust prior to release - #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) + #textline_polygons, box_coord, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new_curved") return tuple(zip(*results)) @@ -4037,7 +4033,7 @@ class Eynollah: def filter_contours_without_textline_inside( - self, contours, text_con_org, contours_textline, + self, contours_par, contours_textline, contours_only_text_parent_d_ordered, conf_contours_textregions): @@ -4049,12 +4045,11 @@ class Eynollah: return [] return list(np.array(lis)[indices]) - return (filterfun(contours), - filterfun(text_con_org), - filterfun(conf_contours_textregions), + return (filterfun(contours_par), filterfun(contours_textline), filterfun(contours_only_text_parent_d_ordered), - indices + filterfun(conf_contours_textregions), + # indices ) def separate_marginals_to_left_and_right_and_order_from_top_to_down( @@ -4592,12 +4587,11 @@ class Eynollah: contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, marginal_cnts=polygons_of_marginals) #print("text region early 3.5 in %.1fs", time.time() - t0) - txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( + conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) - #txt_con_org = dilate_textregion_contours(txt_con_org) #contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) else: - txt_con_org , conf_contours_textregions = get_textregion_contours_in_org_image_light( + conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) #print("text region early 4 in %.1fs", time.time() - t0) boxes_text = get_text_region_boxes_by_given_contours(contours_only_text_parent) @@ -4607,13 +4601,13 @@ class Eynollah: if not self.curved_line: if 
self.light_version: if self.textline_light: - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light2( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, + all_found_textline_polygons, \ + all_box_coord, slopes = self.get_slopes_and_deskew_new_light2( + contours_only_text_parent, textline_mask_tot_ea_org, boxes_text, slope_deskew) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light2( - polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, + all_found_textline_polygons_marginals, \ + all_box_coord_marginals, slopes_marginals = self.get_slopes_and_deskew_new_light2( + polygons_of_marginals, textline_mask_tot_ea_org, boxes_marginals, slope_deskew) all_found_textline_polygons = dilate_textline_contours( @@ -4622,46 +4616,46 @@ class Eynollah: all_found_textline_polygons, None, textline_mask_tot_ea_org, type_contour="textline") all_found_textline_polygons_marginals = dilate_textline_contours( all_found_textline_polygons_marginals) - contours_only_text_parent, txt_con_org, conf_contours_textregions, \ - all_found_textline_polygons, contours_only_text_parent_d_ordered, \ - index_by_text_par_con = self.filter_contours_without_textline_inside( - contours_only_text_parent, txt_con_org, all_found_textline_polygons, + contours_only_text_parent, all_found_textline_polygons, \ + contours_only_text_parent_d_ordered, conf_contours_textregions = \ + self.filter_contours_without_textline_inside( + contours_only_text_parent, all_found_textline_polygons, contours_only_text_parent_d_ordered, conf_contours_textregions) else: textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, \ - 
index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea, + all_found_textline_polygons, \ + all_box_coord, slopes = self.get_slopes_and_deskew_new_light( + contours_only_text_parent, contours_only_text_parent, textline_mask_tot_ea, boxes_text, slope_deskew) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light( + all_found_textline_polygons_marginals, \ + all_box_coord_marginals, slopes_marginals = self.get_slopes_and_deskew_new_light( polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, boxes_marginals, slope_deskew) #all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( # all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline") else: textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea, + all_found_textline_polygons, \ + all_box_coord, slopes = self.get_slopes_and_deskew_new( + contours_only_text_parent, contours_only_text_parent, textline_mask_tot_ea, boxes_text, slope_deskew) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new( + all_found_textline_polygons_marginals, \ + all_box_coord_marginals, slopes_marginals = self.get_slopes_and_deskew_new( polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, boxes_marginals, slope_deskew) else: scale_param = 1 textline_mask_tot_ea_erode = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=2) - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - 
all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea_erode, + all_found_textline_polygons, \ + all_box_coord, slopes = self.get_slopes_and_deskew_new_curved( + contours_only_text_parent, textline_mask_tot_ea_erode, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons = small_textlines_to_parent_adherence2( all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved( - polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_erode, + all_found_textline_polygons_marginals, \ + all_box_coord_marginals, slopes_marginals = self.get_slopes_and_deskew_new_curved( + polygons_of_marginals, textline_mask_tot_ea_erode, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2( @@ -4884,7 +4878,7 @@ class Eynollah: conf_contours_textregions, conf_contours_textregions_h) else: pcgts = self.writer.build_pagexml_no_full_layout( - txt_con_org, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index fb4bbd0..2560846 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -216,7 +216,7 @@ def do_back_rotation_and_get_cnt_back(contour_par, index_r_con, img, slope_first def get_textregion_contours_in_org_image_light(cnts, img, confidence_matrix): if not len(cnts): - return [], [] + return [] 
confidence_matrix = cv2.resize(confidence_matrix, (img.shape[1] // 6, img.shape[0] // 6), @@ -226,7 +226,7 @@ def get_textregion_contours_in_org_image_light(cnts, img, confidence_matrix): cnt_mask = np.zeros(confidence_matrix.shape) cnt_mask = cv2.fillPoly(cnt_mask, pts=[cnt // 6], color=1.0) confs.append(np.sum(confidence_matrix * cnt_mask) / np.sum(cnt_mask)) - return cnts, confs + return confs def return_contours_of_interested_textline(region_pre_p, label): # pixels of images are identified by 5 diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 3bfc903..22ef00d 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1592,7 +1592,7 @@ def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map @wrap_ndarray_shared(kw='textline_mask_tot_ea') def do_work_of_slopes_new( - box_text, contour, contour_par, index_r_con, + box_text, contour, contour_par, textline_mask_tot_ea=None, slope_deskew=0.0, logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None ): @@ -1647,12 +1647,12 @@ def do_work_of_slopes_new( all_text_region_raw[mask_only_con_region == 0] = 0 cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contour_par, box_text) - return cnt_clean_rot, box_text, contour, contour_par, crop_coor, index_r_con, slope + return cnt_clean_rot, crop_coor, slope @wrap_ndarray_shared(kw='textline_mask_tot_ea') @wrap_ndarray_shared(kw='mask_texts_only') def do_work_of_slopes_new_curved( - box_text, contour, contour_par, index_r_con, + box_text, contour_par, textline_mask_tot_ea=None, mask_texts_only=None, num_col=1, scale_par=1.0, slope_deskew=0.0, logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None @@ -1743,11 +1743,11 @@ def do_work_of_slopes_new_curved( slope_for_all, contour_par, box_text, True) - return textlines_cnt_per_region[::-1], box_text, contour, contour_par, crop_coor, index_r_con, slope + return textlines_cnt_per_region[::-1], 
crop_coor, slope @wrap_ndarray_shared(kw='textline_mask_tot_ea') def do_work_of_slopes_new_light( - box_text, contour, contour_par, index_r_con, + box_text, contour, contour_par, textline_mask_tot_ea=None, slope_deskew=0, textline_light=True, logger=None ): @@ -1777,4 +1777,4 @@ def do_work_of_slopes_new_light( all_text_region_raw[mask_only_con_region == 0] = 0 cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_deskew, contour_par, box_text) - return cnt_clean_rot, box_text, contour, contour_par, crop_coor, index_r_con, slope_deskew + return cnt_clean_rot, crop_coor, slope_deskew From e32479765cc52a29462b36f876d253478860f176 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 7 Oct 2025 23:03:27 +0200 Subject: [PATCH 30/44] writer: simplify - simplify serialization of coordinates - re-use `serialize_lines_in_region` (drop `*_in_dropcapital` and `*_in_marginal`) - re-use `calculate_polygon_coords` --- src/eynollah/writer.py | 343 ++++++++++++++++------------------------- 1 file changed, 131 insertions(+), 212 deletions(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 936c95f..67a2989 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -56,113 +56,30 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_all_textlines_textregion): - for j in range(len(all_found_textline_polygons_marginals[marginal_idx])): - coords = CoordsType() - textline = TextLineType(id=counter.next_line_id, Coords=coords) - if ocr_all_textlines_textregion: - textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) - marginal_region.add_TextLine(textline) - marginal_region.set_orientation(-slopes_marginals[marginal_idx]) - points_co = '' - for l in 
range(len(all_found_textline_polygons_marginals[marginal_idx][j])): - if not (self.curved_line or self.textline_light): - if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: - textline_x_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) - textline_y_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) - else: - textline_x_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) - textline_y_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) - points_co += str(textline_x_coord) - points_co += ',' - points_co += str(textline_y_coord) - if (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) <= 45: - if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) - - elif (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) > 45: - if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + 
page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) - points_co += ' ' - coords.set_points(points_co[:-1]) - def serialize_lines_in_region(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion): self.logger.debug('enter serialize_lines_in_region') - for j in range(len(all_found_textline_polygons[region_idx])): + for j, polygon_textline in enumerate(all_found_textline_polygons[region_idx]): coords = CoordsType() textline = TextLineType(id=counter.next_line_id, Coords=coords) if ocr_all_textlines_textregion: - textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) + # FIXME: add OCR confidence + textline.set_TextEquiv([TextEquivType(Unicode=ocr_all_textlines_textregion[j])]) text_region.add_TextLine(textline) text_region.set_orientation(-slopes[region_idx]) region_bboxes = all_box_coord[region_idx] points_co = '' - for idx_contour_textline, contour_textline in enumerate(all_found_textline_polygons[region_idx][j]): - if not (self.curved_line or self.textline_light): - if len(contour_textline) == 2: - textline_x_coord = max(0, int((contour_textline[0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((contour_textline[1] + region_bboxes[0] + page_coord[0]) / self.scale_y)) - else: - textline_x_coord = max(0, int((contour_textline[0][0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, 
int((contour_textline[0][1] + region_bboxes[0] + page_coord[0]) / self.scale_y)) - points_co += str(textline_x_coord) - points_co += ',' - points_co += str(textline_y_coord) - - if self.textline_light or (self.curved_line and np.abs(slopes[region_idx]) <= 45): - if len(contour_textline) == 2: - points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) - elif self.curved_line and np.abs(slopes[region_idx]) > 45: - if len(contour_textline)==2: - points_co += str(int((contour_textline[0] + region_bboxes[2] + page_coord[2])/self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[1] + region_bboxes[0] + page_coord[0])/self.scale_y)) - else: - points_co += str(int((contour_textline[0][0] + region_bboxes[2]+page_coord[2])/self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y)) - points_co += ' ' - coords.set_points(points_co[:-1]) - - def serialize_lines_in_dropcapital(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion): - self.logger.debug('enter serialize_lines_in_region') - for j in range(1): - coords = CoordsType() - textline = TextLineType(id=counter.next_line_id, Coords=coords) - if ocr_all_textlines_textregion: - textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) - text_region.add_TextLine(textline) - #region_bboxes = all_box_coord[region_idx] - points_co = '' - for idx_contour_textline, contour_textline in enumerate(all_found_textline_polygons[j]): - if len(contour_textline) == 2: - points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) - 
points_co += ',' - points_co += str(int((contour_textline[1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) - - points_co += ' ' + for point in polygon_textline: + if len(point) != 2: + point = point[0] + point_x = point[0] + page_coord[2] + point_y = point[1] + page_coord[0] + # FIXME: or actually... not self.textline_light and not self.curved_line or np.abs(slopes[region_idx]) > 45? + if not self.textline_light and not (self.curved_line and np.abs(slopes[region_idx]) <= 45): + point_x += region_bboxes[2] + point_y += region_bboxes[0] + point_x = max(0, int(point_x / self.scale_x)) + point_y = max(0, int(point_y / self.scale_y)) + points_co += str(point_x) + ',' + str(point_y) + ' ' coords.set_points(points_co[:-1]) def write_pagexml(self, pcgts): @@ -170,7 +87,7 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, 
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -179,90 +96,79 @@ class EynollahXmlWriter(): page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) counter = EynollahIdCounter() - if len(found_polygons_text_region) > 0: + if len(order_of_texts): _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] - id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] + id_of_marginalia_left = [_counter_marginals.next_region_id + for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id + for _ in found_polygons_marginals_right] xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) - for mm in range(len(found_polygons_text_region)): - textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord, skip_layout_reading_order), conf=conf_contours_textregion[mm]), - ) - #textregion.set_conf(conf_contours_textregion[mm]) + for mm, region_contour in enumerate(found_polygons_text_region): + textregion = TextRegionType( + id=counter.next_region_id, type_='paragraph', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, + skip_layout_reading_order), + conf=conf_contours_textregion[mm]), + ) 
page.add_TextRegion(textregion) if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] else: ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter, ocr_textlines) + self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, + all_box_coord, slopes, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals_left)): - marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) + for mm, region_contour in enumerate(found_polygons_marginals_left): + marginal = TextRegionType( + id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_left: ocr_textlines = ocr_all_textlines_marginals_left[mm] else: ocr_textlines = None - - #print(ocr_textlines, mm, len(all_found_textline_polygons_marginals_left[mm]) ) - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, + all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals_right)): - marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + for mm, region_contour in enumerate(found_polygons_marginals_right): + marginal = TextRegionType( + id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(marginal) if 
ocr_all_textlines_marginals_right: ocr_textlines = ocr_all_textlines_marginals_right[mm] else: ocr_textlines = None - - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) + self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, + all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) - for mm in range(len(found_polygons_text_region_img)): - img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) + for region_contour in found_polygons_text_region_img: + img_region = ImageRegionType( + id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_ImageRegion(img_region) - points_co = '' - for lmm in range(len(found_polygons_text_region_img[mm])): - try: - points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - points_co += ' ' - except: - points_co += str(int((found_polygons_text_region_img[mm][lmm][0] + page_coord[2])/ self.scale_x )) - points_co += ',' - points_co += str(int((found_polygons_text_region_img[mm][lmm][1] + page_coord[0])/ self.scale_y )) - points_co += ' ' + for region_contour in polygons_seplines: + sep = SeparatorRegionType( + id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])) + ) + page.add_SeparatorRegion(sep) - img_region.get_Coords().set_points(points_co[:-1]) - - for mm in range(len(polygons_lines_to_be_written_in_xml)): - sep_hor = SeparatorRegionType(id=counter.next_region_id, Coords=CoordsType()) - page.add_SeparatorRegion(sep_hor) - points_co = '' - for lmm in range(len(polygons_lines_to_be_written_in_xml[mm])): - points_co += 
str(int((polygons_lines_to_be_written_in_xml[mm][lmm,0,0] ) / self.scale_x)) - points_co += ',' - points_co += str(int((polygons_lines_to_be_written_in_xml[mm][lmm,0,1] ) / self.scale_y)) - points_co += ' ' - sep_hor.get_Coords().set_points(points_co[:-1]) - for mm in range(len(found_polygons_tables)): - tab_region = TableRegionType(id=counter.next_region_id, Coords=CoordsType()) - page.add_TableRegion(tab_region) - points_co = '' - for lmm in range(len(found_polygons_tables[mm])): - points_co += str(int((found_polygons_tables[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((found_polygons_tables[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - points_co += ' ' - tab_region.get_Coords().set_points(points_co[:-1]) + for region_contour in found_polygons_tables: + tab = TableRegionType( + id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) + page.add_TableRegion(tab) return pcgts - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, 
all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -271,99 +177,112 @@ class EynollahXmlWriter(): page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) counter = EynollahIdCounter() - _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] - id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] - xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) + if len(order_of_texts): + _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia_left = [_counter_marginals.next_region_id + for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id + for _ in found_polygons_marginals_right] + xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) - for mm in range(len(found_polygons_text_region)): - textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord), conf=conf_contours_textregion[mm])) + for mm, 
region_contour in enumerate(found_polygons_text_region): + textregion = TextRegionType( + id=counter.next_region_id, type_='paragraph', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord), + conf=conf_contours_textregion[mm]) + ) page.add_TextRegion(textregion) - if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] else: ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter, ocr_textlines) + self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, + all_box_coord, slopes, counter, ocr_textlines) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) - for mm in range(len(found_polygons_text_region_h)): - textregion = TextRegionType(id=counter.next_region_id, type_='heading', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) + for mm, region_contour in enumerate(found_polygons_text_region_h): + textregion = TextRegionType( + id=counter.next_region_id, type_='heading', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(textregion) - if ocr_all_textlines_h: ocr_textlines = ocr_all_textlines_h[mm] else: ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter, ocr_textlines) + self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, + all_box_coord_h, slopes_h, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals_left)): - marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) + for mm, region_contour in enumerate(found_polygons_marginals_left): + marginal = TextRegionType( + id=counter.next_region_id, 
type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_left: ocr_textlines = ocr_all_textlines_marginals_left[mm] else: ocr_textlines = None - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) - - for mm in range(len(found_polygons_marginals_right)): - marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + + for mm, region_contour in enumerate(found_polygons_marginals_right): + marginal = TextRegionType( + id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_right: ocr_textlines = ocr_all_textlines_marginals_right[mm] else: ocr_textlines = None - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) - - for mm in range(len(found_polygons_drop_capitals)): - dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))) + self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, + all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) + + for mm, region_contour in enumerate(found_polygons_drop_capitals): + dropcapital = TextRegionType( + id=counter.next_region_id, 
type_='drop-capital', + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + ) page.add_TextRegion(dropcapital) - all_box_coord_drop = None - slopes_drop = None + all_box_coord_drop = [[0, 0, 0, 0]] + slopes_drop = [0] if ocr_all_textlines_drop: ocr_textlines = ocr_all_textlines_drop[mm] else: ocr_textlines = None - self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=ocr_textlines) + self.serialize_lines_in_region(dropcapital, [[found_polygons_drop_capitals[mm]]], 0, page_coord, + all_box_coord_drop, slopes_drop, counter, ocr_textlines) - for mm in range(len(found_polygons_text_region_img)): - page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)))) + for region_contour in found_polygons_text_region_img: + page.add_ImageRegion( + ImageRegionType(id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)))) - for mm in range(len(polygons_lines_to_be_written_in_xml)): - page.add_SeparatorRegion(SeparatorRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(polygons_lines_to_be_written_in_xml[mm], [0 , 0, 0, 0])))) + for region_contour in polygons_seplines: + page.add_SeparatorRegion( + SeparatorRegionType(id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])))) - for mm in range(len(found_polygons_tables)): - page.add_TableRegion(TableRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_tables[mm], page_coord)))) + for region_contour in found_polygons_tables: + page.add_TableRegion( + TableRegionType(id=counter.next_region_id, + Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, 
page_coord)))) return pcgts def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False): self.logger.debug('enter calculate_polygon_coords') coords = '' - for value_bbox in contour: - if skip_layout_reading_order: - if len(value_bbox) == 2: - coords += str(int((value_bbox[0]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[1]) / self.scale_y)) - else: - coords += str(int((value_bbox[0][0]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[0][1]) / self.scale_y)) - else: - if len(value_bbox) == 2: - coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) - else: - coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) - coords=coords + ' ' + for point in contour: + if len(point) != 2: + point = point[0] + point_x = point[0] + point_y = point[1] + if not skip_layout_reading_order: + point_x += page_coord[2] + point_y += page_coord[0] + point_x = int(point_x / self.scale_x) + point_y = int(point_y / self.scale_y) + coords += str(point_x) + ',' + str(point_y) + ' ' return coords[:-1] From cbbb3248c72c1f3e50b98de1f7e2980bdd14da5d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 00:43:29 +0200 Subject: [PATCH 31/44] writer: simplify - `build_pagexml_no_full_layout`: delegate to `build_pagexml_full_layout` (removing redundant code) --- src/eynollah/writer.py | 133 +++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 84 deletions(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 67a2989..eee7440 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -87,8 +87,50 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, 
order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): - self.logger.debug('enter build_pagexml_no_full_layout') + def build_pagexml_no_full_layout( + self, found_polygons_text_region, + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, + all_box_coord, + found_polygons_text_region_img, + found_polygons_marginals_left, found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + found_polygons_tables, + **kwargs): + return self.build_pagexml_full_layout( + found_polygons_text_region, [], + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, [], + all_box_coord, [], + found_polygons_text_region_img, found_polygons_tables, [], + found_polygons_marginals_left, found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, [], slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + **kwargs) + + def build_pagexml_full_layout( + self, + found_polygons_text_region, found_polygons_text_region_h, + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, all_found_textline_polygons_h, + all_box_coord, all_box_coord_h, + 
found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, + found_polygons_marginals_left,found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + ocr_all_textlines=None, ocr_all_textlines_h=None, + ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, + ocr_all_textlines_drop=None, + conf_contours_textregion=None, conf_contours_textregion_h=None, + skip_layout_reading_order=False): + self.logger.debug('enter build_pagexml') # create the file structure pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) @@ -108,89 +150,10 @@ class EynollahXmlWriter(): textregion = TextRegionType( id=counter.next_region_id, type_='paragraph', Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, - skip_layout_reading_order), - conf=conf_contours_textregion[mm]), - ) - page.add_TextRegion(textregion) - if ocr_all_textlines: - ocr_textlines = ocr_all_textlines[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, - all_box_coord, slopes, counter, ocr_textlines) - - for mm, region_contour in enumerate(found_polygons_marginals_left): - marginal = TextRegionType( - id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TextRegion(marginal) - if ocr_all_textlines_marginals_left: - ocr_textlines = ocr_all_textlines_marginals_left[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, - all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) - - for mm, region_contour 
in enumerate(found_polygons_marginals_right): - marginal = TextRegionType( - id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TextRegion(marginal) - if ocr_all_textlines_marginals_right: - ocr_textlines = ocr_all_textlines_marginals_right[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, - all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) - - for region_contour in found_polygons_text_region_img: - img_region = ImageRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_ImageRegion(img_region) - - for region_contour in polygons_seplines: - sep = SeparatorRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])) - ) - page.add_SeparatorRegion(sep) - - for region_contour in found_polygons_tables: - tab = TableRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TableRegion(tab) - - return pcgts - - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, 
ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): - self.logger.debug('enter build_pagexml_full_layout') - - # create the file structure - pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) - page = pcgts.get_Page() - page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) - - counter = EynollahIdCounter() - if len(order_of_texts): - _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia_left = [_counter_marginals.next_region_id - for _ in found_polygons_marginals_left] - id_of_marginalia_right = [_counter_marginals.next_region_id - for _ in found_polygons_marginals_right] - xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) - - for mm, region_contour in enumerate(found_polygons_text_region): - textregion = TextRegionType( - id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord), - conf=conf_contours_textregion[mm]) + skip_layout_reading_order)) ) + if conf_contours_textregion: + textregion.Coords.set_conf(conf_contours_textregion[mm]) page.add_TextRegion(textregion) if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] @@ -205,6 +168,8 @@ class EynollahXmlWriter(): id=counter.next_region_id, type_='heading', Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) ) + if conf_contours_textregion_h: + textregion.Coords.set_conf(conf_contours_textregion_h[mm]) page.add_TextRegion(textregion) if ocr_all_textlines_h: ocr_textlines = ocr_all_textlines_h[mm] From 75823f9bed64153718acab6f664cdfc114ef34fb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 00:54:53 +0200 Subject: [PATCH 32/44] run_single: call `writer.build_pagexml_no_full_layout` w/ kwargs --- src/eynollah/eynollah.py | 32 
++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index ec68bcd..b109c90 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4260,18 +4260,6 @@ class Eynollah: order_text_new = [0] slopes =[0] id_of_texts_tot =['region_0001'] - - polygons_of_images = [] - slopes_marginals_left = [] - slopes_marginals_right = [] - polygons_of_marginals_left = [] - polygons_of_marginals_right = [] - all_found_textline_polygons_marginals_left = [] - all_found_textline_polygons_marginals_right = [] - all_box_coord_marginals_left = [] - all_box_coord_marginals_right = [] - polygons_seplines = [] - contours_tables = [] conf_contours_textregions =[0] if self.ocr and not self.tr: @@ -4284,15 +4272,13 @@ class Eynollah: pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, page_coord, polygons_of_images, - polygons_of_marginals_left, polygons_of_marginals_right, - all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, - all_box_coord_marginals_left, all_box_coord_marginals_right, - slopes, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, contours_tables, + all_found_textline_polygons, page_coord, [], + [], [], [], [], [], [], + slopes, [], [], + cont_page, [], [], ocr_all_textlines=ocr_all_textlines, conf_contours_textregion=conf_contours_textregions, - skip_layout_reading_order=self.skip_layout_and_reading_order) + skip_layout_reading_order=True) self.logger.info("Basic processing complete") return pcgts @@ -4884,9 +4870,11 @@ class Eynollah: all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, contours_tables, ocr_all_textlines, - 
ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, - conf_contours_textregions) + cont_page, polygons_seplines, contours_tables, + ocr_all_textlines=ocr_all_textlines, + ocr_all_textlines_marginals_left=ocr_all_textlines_marginals_left, + ocr_all_textlines_marginals_right=ocr_all_textlines_marginals_right, + conf_contours_textregions=conf_contours_textregions) return pcgts From 5e11a68a3e18e926b25829e0fce3c279e529aca0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 01:03:48 +0200 Subject: [PATCH 33/44] writer/run_single: consistent kwarg naming `conf_contours_textregion(s)` --- src/eynollah/writer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index eee7440..8859d95 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -128,7 +128,7 @@ class EynollahXmlWriter(): ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, - conf_contours_textregion=None, conf_contours_textregion_h=None, + conf_contours_textregions=None, conf_contours_textregions_h=None, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml') @@ -152,8 +152,8 @@ class EynollahXmlWriter(): Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, skip_layout_reading_order)) ) - if conf_contours_textregion: - textregion.Coords.set_conf(conf_contours_textregion[mm]) + if conf_contours_textregions: + textregion.Coords.set_conf(conf_contours_textregions[mm]) page.add_TextRegion(textregion) if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] @@ -168,8 +168,8 @@ class EynollahXmlWriter(): id=counter.next_region_id, type_='heading', Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) ) - if conf_contours_textregion_h: - textregion.Coords.set_conf(conf_contours_textregion_h[mm]) + if 
conf_contours_textregions_h: + textregion.Coords.set_conf(conf_contours_textregions_h[mm]) page.add_TextRegion(textregion) if ocr_all_textlines_h: ocr_textlines = ocr_all_textlines_h[mm] From ca72a095cab373b6daa2f7353f456d9eacfd399b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 00:44:32 +0200 Subject: [PATCH 34/44] tests: cover table detection in various modes --- tests/test_run.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/test_run.py b/tests/test_run.py index 98cee30..79c64c2 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -67,6 +67,44 @@ def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options): lines = tree.xpath("//page:TextLine", namespaces=NS) assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line +@pytest.mark.parametrize( + "options", + [ + ["--tables"], + ["--tables", "--full-layout"], + ["--tables", "--full-layout", "--textline_light", "--light_version"], + ], ids=str) +def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options): + infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif') + outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml' + args = [ + '-m', MODELS_LAYOUT, + '-i', str(infile), + '-o', str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + def only_eynollah(logrec): + return logrec.name == 'eynollah' + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(layout_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert str(infile) in logmsgs + assert outfile.exists() + tree = page_from_file(str(outfile)).etree + regions = tree.xpath("//page:TextRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + regions = 
tree.xpath("//page:TableRegion", namespaces=NS) + # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP + assert len(regions) >= 1, "result is inaccurate" + regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + lines = tree.xpath("//page:TextLine", namespaces=NS) + assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line + def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path From e5b52645685b669d5af7c5da2870a01660f81cdb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 12:17:53 +0200 Subject: [PATCH 35/44] CI: add diagnostic message for model symlink --- .github/workflows/test-eynollah.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 7c3f5ae..759b26c 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -66,6 +66,7 @@ jobs: python -m pip install --upgrade pip make install-dev EXTRAS=OCR,plotting make deps-test EXTRAS=OCR,plotting + ls -l models_* - name: Test with pytest run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml" - name: Get coverage results From 839b7c4d846d6f73069529aa1f337caa362917c0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 12:33:14 +0200 Subject: [PATCH 36/44] make models: avoid re-download --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 618b1f9..29dd877 100644 --- a/Makefile +++ b/Makefile @@ -58,6 +58,9 @@ help: # Download and extract models to $(PWD)/models_layout_v0_5_0 models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME) +# do not download these files if we already have the directories +.INTERMEDIATE: $(BIN_MODELFILE) $(SEG_MODELFILE) $(OCR_MODELFILE) + $(BIN_MODELFILE): wget -O $@ 
$(BIN_MODEL) $(SEG_MODELFILE): From 1d4815b48f1f5b1bf006efe78141fd3161ee8073 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 14:56:14 +0200 Subject: [PATCH 37/44] utils_ocr: forgot to pass coordinate offsets --- src/eynollah/eynollah.py | 24 ++++++++++++------------ src/eynollah/utils/utils_ocr.py | 10 ++++++++-- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b109c90..a6b65c4 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4265,8 +4265,8 @@ class Eynollah: if self.ocr and not self.tr: gc.collect() ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons, self.prediction_model, - self.b_s_ocr, self.num_to_char, textline_light=True) + image_page, all_found_textline_polygons, np.zeros((len(all_found_textline_polygons), 4)), + self.prediction_model, self.b_s_ocr, self.num_to_char, textline_light=True) else: ocr_all_textlines = None @@ -4756,36 +4756,36 @@ class Eynollah: if len(all_found_textline_polygons)>0: ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, all_found_textline_polygons, all_box_coord, + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_left, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, all_found_textline_polygons_marginals_left, all_box_coord_marginals_left, + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: 
ocr_all_textlines_marginals_left = None if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_right, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, all_found_textline_polygons_marginals_right, all_box_coord_marginals_right, + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_marginals_right = None if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_h, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, all_found_textline_polygons_h, all_box_coord_h, + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_h = None if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines( - image_page, polygons_of_drop_capitals, self.prediction_model, - self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + image_page, polygons_of_drop_capitals, np.zeros((len(polygons_of_drop_capitals), 4)), + self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines_drop = None diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 602ad6e..6e71b0f 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -1,13 +1,17 @@ +import math +import copy + import numpy as np import cv2 import tensorflow as tf from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d -import math from PIL import Image, ImageDraw, ImageFont from Bio import 
pairwise2 + from .resize import resize_image + def decode_batch_predictions(pred, num_to_char, max_len = 128): # input_len is the product of the batch size and the # number of time steps. @@ -370,7 +374,9 @@ def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind return textline_contour -def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, +def return_rnn_cnn_ocr_of_given_textlines(image, + all_found_textline_polygons, + all_box_coord, prediction_model, b_s_ocr, num_to_char, textline_light=False, From 027b87d32125afdc1bebbb968fc32b55b58bf153 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 14:56:57 +0200 Subject: [PATCH 38/44] fixup c0137c2 (missing arguments for utils_ocr) --- src/eynollah/eynollah.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index a6b65c4..aeb01be 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -96,6 +96,7 @@ from .utils.rotate import ( rotation_image_new ) from .utils.utils_ocr import ( + return_start_and_end_of_common_text_of_textline_ocr_without_common_section, return_textline_contour_with_added_box_coordinate, preprocess_and_resize_image_for_ocrcnn_model, return_textlines_split_if_needed, @@ -4796,7 +4797,6 @@ class Eynollah: self.logger.info("Using light text line detection for OCR") self.logger.info("Processing text lines...") - self.device.reset() gc.collect() torch.cuda.empty_cache() From 096def1e9d0b95cf3690734730f675ae5a74c0fd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 15:13:13 +0200 Subject: [PATCH 39/44] mbreorder/enhancment: fix missing imports (not sure if these models really need that, though) --- src/eynollah/image_enhancer.py | 6 +++--- src/eynollah/mb_ro_on_layout.py | 7 +++---- tests/test_smoke.py | 1 - 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py index 
89dde16..9247efe 100644 --- a/src/eynollah/image_enhancer.py +++ b/src/eynollah/image_enhancer.py @@ -6,23 +6,23 @@ from logging import Logger import os import time from typing import Optional -import atexit -from functools import partial from pathlib import Path -from multiprocessing import cpu_count import gc + import cv2 import numpy as np from ocrd_utils import getLogger, tf_disable_interactive_logs import tensorflow as tf from skimage.morphology import skeletonize from tensorflow.keras.models import load_model + from .utils.resize import resize_image from .utils.pil_cv2 import pil2cv from .utils import ( is_image_filename, crop_image_inside_box ) +from .eynollah import PatchEncoder, Patches DPI_THRESHOLD = 298 KERNEL = np.ones((5, 5), np.uint8) diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index 45db8e4..218f973 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -6,25 +6,24 @@ from logging import Logger import os import time from typing import Optional -import atexit -from functools import partial from pathlib import Path -from multiprocessing import cpu_count import xml.etree.ElementTree as ET + import cv2 import numpy as np from ocrd_utils import getLogger import statistics import tensorflow as tf from tensorflow.keras.models import load_model -from .utils.resize import resize_image +from .utils.resize import resize_image from .utils.contour import ( find_new_features_of_contours, return_contours_of_image, return_parent_contours, ) from .utils import is_xml_filename +from .eynollah import PatchEncoder, Patches DPI_THRESHOLD = 298 KERNEL = np.ones((5, 5), np.uint8) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 252213f..e2b323a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2,6 +2,5 @@ def test_utils_import(): import eynollah.utils import eynollah.utils.contour import eynollah.utils.drop_capitals - import eynollah.utils.drop_capitals import eynollah.utils.is_nan 
import eynollah.utils.rotate From 8a2d682e12d8e95414aa53f1e2a9cfea74c778a3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 16:52:22 +0200 Subject: [PATCH 40/44] fix identifier scope in layout OCR options (w/o full_layout) --- src/eynollah/eynollah.py | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index aeb01be..7d6229a 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4726,7 +4726,6 @@ class Eynollah: self.plotter.write_images_into_directory(polygons_of_images, image_page) t_order = time.time() - #if self.full_layout: self.logger.info("Step 4/5: Reading Order Detection") if self.reading_order_machine_based: @@ -4749,46 +4748,41 @@ class Eynollah: boxes_d, textline_mask_tot_d) self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") + ocr_all_textlines = None + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None + ocr_all_textlines_h = None + ocr_all_textlines_drop = None if self.ocr: self.logger.info("Step 4.5/5: OCR Processing") if not self.tr: gc.collect() - if len(all_found_textline_polygons)>0: + if len(all_found_textline_polygons): ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons, all_box_coord, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines = None - if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: + if len(all_found_textline_polygons_marginals_left): ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_marginals_left, all_box_coord_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines_marginals_left = None - if 
all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: + if len(all_found_textline_polygons_marginals_right): ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_marginals_right, all_box_coord_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines_marginals_right = None - if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: + if self.full_layout and len(all_found_textline_polygons): ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines( image_page, all_found_textline_polygons_h, all_box_coord_h, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines_h = None - if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: + if self.full_layout and len(polygons_of_drop_capitals): ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines( image_page, polygons_of_drop_capitals, np.zeros((len(polygons_of_drop_capitals), 4)), self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - else: - ocr_all_textlines_drop = None else: if self.light_version: @@ -4805,6 +4799,7 @@ class Eynollah: ind_tot = 0 #cv2.imwrite('./img_out.png', image_page) ocr_all_textlines = [] + # FIXME: what about lines in marginals / headings / drop-capitals here? 
for indexing, ind_poly_first in enumerate(all_found_textline_polygons): ocr_textline_in_textregion = [] for indexing2, ind_poly in enumerate(ind_poly_first): @@ -4840,12 +4835,6 @@ class Eynollah: ocr_textline_in_textregion.append(text_ocr) ind_tot = ind_tot +1 ocr_all_textlines.append(ocr_textline_in_textregion) - else: - ocr_all_textlines = None - ocr_all_textlines_marginals_left = None - ocr_all_textlines_marginals_right = None - ocr_all_textlines_h = None - ocr_all_textlines_drop = None self.logger.info("Step 5/5: Output Generation") From b3d29bef8961435f85cf0c95ec3dd6c239e74621 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 19:21:07 +0200 Subject: [PATCH 41/44] return_contours_of_interested_region*: rm unused variants --- src/eynollah/eynollah.py | 17 +++++++---------- src/eynollah/utils/contour.py | 33 --------------------------------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 7d6229a..e15afd6 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -79,7 +79,6 @@ from .utils.contour import ( get_textregion_contours_in_org_image_light, return_contours_of_image, return_contours_of_interested_region, - return_contours_of_interested_region_by_min_size, return_contours_of_interested_textline, return_parent_contours, dilate_textregion_contours, @@ -4242,14 +4241,11 @@ class Eynollah: all_found_textline_polygons = filter_contours_area_of_image( textline_mask_tot_ea, cnt_clean_rot_raw, hir_on_cnt_clean_rot, max_area=1, min_area=0.00001) - M_main_tot = [cv2.moments(all_found_textline_polygons[j]) - for j in range(len(all_found_textline_polygons))] - w_h_textlines = [cv2.boundingRect(all_found_textline_polygons[j])[2:] - for j in range(len(all_found_textline_polygons))] - w_h_textlines = [w_h_textlines[j][0] / float(w_h_textlines[j][1]) for j in range(len(w_h_textlines))] - cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in 
range(len(M_main_tot))] - cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] - + cx_main_tot, cy_main_tot = find_center_of_contours(all_found_textline_polygons) + w_h_textlines = [cv2.boundingRect(polygon)[2:] + for polygon in all_found_textline_polygons] + w_h_textlines = [w / float(h) for w, h in w_h_textlines] + all_found_textline_polygons = self.get_textlines_of_a_textregion_sorted( #all_found_textline_polygons[::-1] all_found_textline_polygons, cx_main_tot, cy_main_tot, w_h_textlines) @@ -4677,7 +4673,8 @@ class Eynollah: self.plotter.save_plot_of_layout_all(text_regions_p, image_page) label_img = 4 - polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, label_img) + polygons_of_drop_capitals = return_contours_of_interested_region(text_regions_p, label_img, + min_area=0.00003) ##all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( ##text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, ##all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 2560846..f998c4d 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -253,39 +253,6 @@ def return_contours_of_image(image): contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) return contours, hierarchy -def return_contours_of_interested_region_by_min_size(region_pre_p, label, min_size=0.00003): - # pixels of images are identified by 5 - if region_pre_p.ndim == 3: - cnts_images = (region_pre_p[:, :, 0] == label) * 1 - else: - cnts_images = (region_pre_p[:, :] == label) * 1 - _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) - - contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_imgs = 
return_parent_contours(contours_imgs, hierarchy) - contours_imgs = filter_contours_area_of_image_tables( - thresh, contours_imgs, hierarchy, max_area=1, min_area=min_size) - - return contours_imgs - -def return_contours_of_interested_region_by_size(region_pre_p, label, min_area, max_area): - # pixels of images are identified by 5 - if region_pre_p.ndim == 3: - cnts_images = (region_pre_p[:, :, 0] == label) * 1 - else: - cnts_images = (region_pre_p[:, :] == label) * 1 - _, thresh = cv2.threshold(cnts_images.astype(np.uint8), 0, 255, 0) - contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - contours_imgs = return_parent_contours(contours_imgs, hierarchy) - contours_imgs = filter_contours_area_of_image_tables( - thresh, contours_imgs, hierarchy, max_area=max_area, min_area=min_area) - - img_ret = np.zeros((region_pre_p.shape[0], region_pre_p.shape[1])) - img_ret = cv2.fillPoly(img_ret, pts=contours_imgs, color=1) - - return img_ret - def dilate_textline_contours(all_found_textline_polygons): return [[polygon2contour(contour2polygon(contour, dilate=6)) for contour in region] From a144026b2789ae056c7bac619d2e3e2b582e62d6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 15:13:57 +0200 Subject: [PATCH 42/44] add rough ruff config --- pyproject.toml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 8a63543..2df39b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,3 +51,18 @@ where = ["src"] [tool.coverage.run] branch = true source = ["eynollah"] + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +ignore = [ +# disable unused imports +"F401", +# disable import order +"E402", +# disable unused variables +"F841", +# disable bare except +"E722", +] From e1b56d97dab9eed6110fabd85b5ae74b36f18c9f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Oct 2025 17:54:38 +0200 Subject: [PATCH 43/44] CI: lint with ruff --- 
.github/workflows/test-eynollah.yml | 4 ++++ pyproject.toml | 3 +++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 759b26c..466e690 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -67,6 +67,10 @@ jobs: make install-dev EXTRAS=OCR,plotting make deps-test EXTRAS=OCR,plotting ls -l models_* + - name: Lint with ruff + uses: astral-sh/ruff-action@v3 + with: + src: "./src" - name: Test with pytest run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml" - name: Get coverage results diff --git a/pyproject.toml b/pyproject.toml index 2df39b9..79f9164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,3 +66,6 @@ ignore = [ # disable bare except "E722", ] + +[tool.ruff.format] +quote-style = "preserve" From cab392601e74e0360e659296f26e1719fb6f742f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Oct 2025 20:12:06 +0200 Subject: [PATCH 44/44] :memo: update changelog --- CHANGELOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6776d6..ab3dd83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,11 +15,17 @@ Fixed: * `get_smallest_skew`: after shifting search range of rotation angle, use overall best result * Dockerfile: fix CUDA installation (cuDNN contested between Torch and TF due to extra OCR) * OCR: re-instate missing methods and fix `utils_ocr` function calls + * mbreorder/enhancement CLIs: missing imports * :fire: writer: `SeparatorRegion` needs `SeparatorRegionType` (not `ImageRegionType`) f458e3e * tests: switch from `pytest-subtests` to `parametrize` so we can use `pytest-isolate` (so CUDA memory gets freed between tests if running on GPU) +Added: + * test coverage for OCR options in `layout` + * test coverage for table detection in `layout` + * CI linting with ruff + Changed: * polygons: slightly widen for regions and lines, increase for separators @@ -28,7 +34,19 @@ 
Changed: but use shared memory if necessary, and switch back from `loky` to stdlib, and shutdown in `del()` instead of `atexit` * :fire: OCR: switch CNN-RNN model to `20250930` version compatible with TF 2.12 on CPU, too + * OCR: allow running `-tr` without `-fl`, too * :fire: writer: use `@type='heading'` instead of `'header'` for headings + * :fire: performance gains via refactoring (simplification, less copy-pasted code, vectorization, + avoiding unused calculations, avoiding unnecessary 3-channel image operations) + * :fire: heuristic reading order detection: many improvements + - contour vs splitter box matching: + * contour must be contained in box exactly instead of heuristics + * fall back to center matching, where the center must be contained in the box + - original vs deskewed contour matching: + * same min-area filter on both sides + * similar area score in addition to center proximity + * avoid duplicate and missing mappings by allowing N:M + matches and splitting+joining where necessary * CI: update+improve model caching