From 73b7c780abc9cccab32d391e0d835c4d13199412 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 9 Mar 2021 08:20:33 -0500 Subject: [PATCH 1/3] Update eynollah.py reading order bug for documents with text regions less than 5: fixed --- qurator/eynollah/eynollah.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index cb5b028..f8d8e9f 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -1672,10 +1672,17 @@ class Eynollah: cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d]) cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_d) try: - cx_bigest_d_last5 = cx_bigest_d[-5:] - cy_biggest_d_last5 = cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) + if len(cx_bigest_d) >= 5: + cx_bigest_d_last5 = cx_bigest_d[-5:] + cy_biggest_d_last5 = cy_biggest_d[-5:] + dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) + else: + cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] + cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] + dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) + cx_bigest_d_big[0] = cx_bigest_d[ind_largest] cy_biggest_d_big[0] = cy_biggest_d[ind_largest] except Exception as why: From 4b3c8a67070a63abe118b7b52dd10a5e860e9e22 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 9 Mar 2021 08:33:37 -0500 Subject: [PATCH 2/3] bug in reading order is fixed --- qurator/eynollah/utils/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/qurator/eynollah/utils/__init__.py b/qurator/eynollah/utils/__init__.py index ac72ef9..d095b4b 100644 --- a/qurator/eynollah/utils/__init__.py +++ b/qurator/eynollah/utils/__init__.py @@ -2155,13 +2155,17 @@ def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_witho x_start_by_order=[] x_end_by_order=[] if len(x_starting)>0: + all_columns = np.array(range(len(peaks_neg_tot)-1)) columns_covered_by_lines_covered_more_than_2col=[] for dj in range(len(x_starting)): - columns_covered_by_lines_covered_more_than_2col=columns_covered_by_lines_covered_more_than_2col+list(np.array(range(x_starting[dj],x_ending[dj])) ) + if set( list(np.array(range(x_starting[dj],x_ending[dj])) ) ) == set(all_columns): + pass + else: + columns_covered_by_lines_covered_more_than_2col=columns_covered_by_lines_covered_more_than_2col+list(np.array(range(x_starting[dj],x_ending[dj])) ) columns_covered_by_lines_covered_more_than_2col=list(set(columns_covered_by_lines_covered_more_than_2col)) - all_columns=np.array(range(len(peaks_neg_tot)-1)) + columns_not_covered=list( set(all_columns)-set(columns_covered_by_lines_covered_more_than_2col) ) From 67a9fc8820f5f4300d9533a929412b6685ed329a Mon Sep 17 00:00:00 2001 From: vahid Date: Tue, 9 Mar 2021 08:55:19 -0500 Subject: [PATCH 3/3] .. --- qurator/eynollah/eynollah.py | 2 +- qurator/eynollah/utils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index f8d8e9f..d587cc9 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -1677,7 +1677,7 @@ class Eynollah: cy_biggest_d_last5 = cy_biggest_d[-5:] dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) for j in range(len(cy_biggest_d_last5))] ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) - else: + else: cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] diff --git a/qurator/eynollah/utils/__init__.py b/qurator/eynollah/utils/__init__.py index d095b4b..707e32d 100644 --- a/qurator/eynollah/utils/__init__.py +++ b/qurator/eynollah/utils/__init__.py @@ -2161,7 +2161,7 @@ def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_witho for dj in range(len(x_starting)): if set( list(np.array(range(x_starting[dj],x_ending[dj])) ) ) == set(all_columns): pass - else: + else: columns_covered_by_lines_covered_more_than_2col=columns_covered_by_lines_covered_more_than_2col+list(np.array(range(x_starting[dj],x_ending[dj])) ) columns_covered_by_lines_covered_more_than_2col=list(set(columns_covered_by_lines_covered_more_than_2col))