From 7983a650065c392f64585e2ba540f639bde45bf6 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sat, 14 Oct 2023 13:31:56 +0200 Subject: [PATCH 01/47] filtering separators in a correct way without missing them --- qurator/eynollah/utils/contour.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/qurator/eynollah/utils/contour.py b/qurator/eynollah/utils/contour.py index bac8235..53b39b5 100644 --- a/qurator/eynollah/utils/contour.py +++ b/qurator/eynollah/utils/contour.py @@ -44,8 +44,8 @@ def get_text_region_boxes_by_given_contours(contours): def filter_contours_area_of_image(image, contours, hierarchy, max_area, min_area): found_polygons_early = list() - jv = 0 - for c in contours: + + for jv,c in enumerate(contours): if len(c) < 3: # A polygon cannot have less than 3 points continue @@ -53,14 +53,12 @@ def filter_contours_area_of_image(image, contours, hierarchy, max_area, min_area area = polygon.area if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and hierarchy[0][jv][3] == -1: # and hierarchy[0][jv][3]==-1 : found_polygons_early.append(np.array([[point] for point in polygon.exterior.coords], dtype=np.uint)) - jv += 1 return found_polygons_early def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, min_area): found_polygons_early = list() - jv = 0 - for c in contours: + for jv,c in enumerate(contours): if len(c) < 3: # A polygon cannot have less than 3 points continue @@ -73,7 +71,6 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, m if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]): # and hierarchy[0][jv][3]==-1 : # print(c[0][0][1]) found_polygons_early.append(np.array([[point] for point in polygon.exterior.coords], dtype=np.int32)) - jv += 1 return found_polygons_early def find_new_features_of_contours(contours_main): @@ -234,8 +231,6 @@ def get_textregion_contours_in_org_image_multi2(cnts, img, slope_first): with Pool(cpu_count()) as p: cnts_org = p.starmap(loop_contour_image, [(index_l,cnts, img,slope_first) for index_l in range(len(cnts))]) - print(len(cnts_org),'lendiha') - return cnts_org def get_textregion_contours_in_org_image(cnts, img, slope_first): From f2811ee46990e95de7f0534411546954744416db Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Tue, 24 Oct 2023 17:32:06 +0200 Subject: [PATCH 02/47] add supported OS to readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 47d81bc..b095edb 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,8 @@ * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface ## Installation -Python versions `3.8-3.11` with Tensorflow versions >=`2.12` are currently supported. +Python versions `3.8-3.11` with Tensorflow versions >=`2.12` on Linux are currently supported. Unfortunately we can not currently support Windows or MacOS. +Windows users may be able to successfully run the tool through [WSL](https://learn.microsoft.com/en-us/windows/wsl/). For (limited) GPU support the CUDA toolkit needs to be installed. 
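A note on PATCH 01/47 above: the switch to enumerate() is what makes the filtering correct. The old code advanced a manual counter jv only at the bottom of the loop body, so every contour rejected by the early continue (fewer than 3 points) skipped the jv += 1 and left jv out of step with the contour index; the hierarchy[0][jv][3] == -1 parent check was then evaluated against the wrong hierarchy row and valid separators could be dropped. A minimal, self-contained sketch of the corrected pattern (hypothetical function and parameter names, not part of the patch):

import numpy as np

def filter_top_level_contours(contours, hierarchy, min_points=3):
    # enumerate() supplies jv, so it always matches the OpenCV hierarchy row;
    # the continue below no longer desynchronizes the index.
    kept = []
    for jv, c in enumerate(contours):
        if len(c) < min_points:  # a polygon cannot have fewer than 3 points
            continue
        if hierarchy[0][jv][3] == -1:  # parent == -1 means outermost contour
            kept.append(np.asarray(c))
    return kept
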
From 6018b354aa9ca5ac97d522f33afe1bff94b76ea5 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 27 Nov 2023 17:23:34 +0100 Subject: [PATCH 03/47] comment unnecessary print commands --- qurator/eynollah/eynollah.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 4b1b5e9..49422fa 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -2071,7 +2071,6 @@ class Eynollah: arg_text_con = [] for ii in range(len(cx_text_only)): for jj in range(len(boxes)): - print(cx_text_only[ii],cy_text_only[ii],'markaz') if cx_text_only[ii] >= boxes[jj][0] and cx_text_only[ii] < boxes[jj][1] and cy_text_only[ii] >= boxes[jj][2] and cy_text_only[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located arg_text_con.append(jj) break @@ -2107,8 +2106,6 @@ class Eynollah: ref_point += len(id_of_texts) order_of_texts_tot = [] - print(len(contours_only_text_parent),'contours_only_text_parent') - print(len(order_by_con_main),'order_by_con_main') for tj1 in range(len(contours_only_text_parent)): order_of_texts_tot.append(int(order_by_con_main[tj1])) From e7d12d3549caaae7024e23fae0aa1cfaac8221ae Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 27 Nov 2023 20:18:24 +0100 Subject: [PATCH 04/47] first update for only images extraction --- qurator/eynollah/eynollah.py | 637 +++++++++++++++++++++-------------- 1 file changed, 376 insertions(+), 261 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 49422fa..2375ad3 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -195,6 +195,7 @@ class Eynollah: self.allow_scaling = allow_scaling self.headers_off = headers_off self.light_version = light_version + self.extract_only_images = True self.ignore_page_extraction = ignore_page_extraction self.pcgts = pcgts if not dir_in: @@ -225,6 +226,7 @@ class Eynollah: self.model_page_dir = dir_models + "/eynollah-page-extraction_20210425" self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" + self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" if self.textline_light: self.model_textline_dir = dir_models + "/eynollah-textline_light_20210425" else: @@ -249,7 +251,23 @@ class Eynollah: self.ls_imgs = os.listdir(self.dir_in) - if dir_in and not light_version: + if dir_in and self.extract_only_images: + config = tf.compat.v1.ConfigProto() + config.gpu_options.allow_growth = True + session = tf.compat.v1.Session(config=config) + set_session(session) + + self.model_page = self.our_load_model(self.model_page_dir) + self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) + #self.model_bin = self.our_load_model(self.model_dir_of_binarization) + #self.model_textline = self.our_load_model(self.model_textline_dir) + self.model_region = self.our_load_model(self.model_region_dir_p_ens_light_only_images_extraction) + #self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np) + #self.model_region_fl = self.our_load_model(self.model_region_dir_fully) + + self.ls_imgs = os.listdir(self.dir_in) + + if dir_in and not (light_version or self.extract_only_images): config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True session = tf.compat.v1.Session(config=config) @@ -267,6 +285,7 @@ class 
Eynollah: self.ls_imgs = os.listdir(self.dir_in) + def _cache_images(self, image_filename=None, image_pil=None): ret = {} @@ -462,6 +481,27 @@ class Eynollah: num_column_is_classified = True return img_new, num_column_is_classified + + def calculate_width_height_by_columns_extract_only_images(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1: + img_w_new = 700 + elif num_col == 2: + img_w_new = 900 + elif num_col == 3: + img_w_new = 1500 + elif num_col == 4: + img_w_new = 1800 + elif num_col == 5: + img_w_new = 2200 + elif num_col == 6: + img_w_new = 2500 + img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new) + + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, num_column_is_classified def resize_image_with_column_classifier(self, is_image_enhanced, img_bin): self.logger.debug("enter resize_image_with_column_classifier") @@ -511,7 +551,7 @@ class Eynollah: is_image_enhanced = True return img, img_new, is_image_enhanced - + def resize_and_enhance_image_with_column_classifier(self,light_version): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") dpi = self.dpi @@ -569,17 +609,22 @@ class Eynollah: num_col = np.argmax(label_p_pred[0]) + 1 self.logger.info("Found %d columns (%s)", num_col, np.around(label_p_pred, decimals=5)) - - if dpi < DPI_THRESHOLD: - img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) - if light_version: - image_res = np.copy(img_new) + + if not self.extract_only_images: + if dpi < DPI_THRESHOLD: + img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) + if light_version: + image_res = np.copy(img_new) + else: + image_res = self.predict_enhancement(img_new) + is_image_enhanced = True else: - image_res = self.predict_enhancement(img_new) - is_image_enhanced = True + num_column_is_classified = True + image_res = np.copy(img) + is_image_enhanced = False else: - num_column_is_classified = True - image_res = np.copy(img) + img_new, num_column_is_classified = self.calculate_width_height_by_columns_extract_only_images(img, num_col, width_early, label_p_pred) + image_res = np.copy(img_new) is_image_enhanced = False self.logger.debug("exit resize_and_enhance_image_with_column_classifier") @@ -867,11 +912,13 @@ class Eynollah: seg_not_base = label_p_pred[0,:,:,4] ##seg2 = -label_p_pred[0,:,:,2] - - seg_not_base[seg_not_base>0.03] =1 - seg_not_base[seg_not_base<1] =0 - - + if self.extract_only_images: + seg_not_base[seg_not_base>0.3] =1 + seg_not_base[seg_not_base<1] =0 + else: + seg_not_base[seg_not_base>0.03] =1 + seg_not_base[seg_not_base<1] =0 + seg_test = label_p_pred[0,:,:,1] ##seg2 = -label_p_pred[0,:,:,2] @@ -888,13 +935,10 @@ class Eynollah: seg_line[seg_line>0.1] =1 seg_line[seg_line<1] =0 - - seg_background = label_p_pred[0,:,:,0] - ##seg2 = -label_p_pred[0,:,:,2] - - - seg_background[seg_background>0.25] =1 - seg_background[seg_background<1] =0 + if not self.extract_only_images: + seg_background = label_p_pred[0,:,:,0] + seg_background[seg_background>0.25] =1 + seg_background[seg_background<1] =0 ##seg = seg+seg2 #seg = label_p_pred[0,:,:,2] #seg[seg>0.4] =1 @@ -908,7 +952,8 @@ class Eynollah: #seg[seg==1]=0 #seg[seg_test==1]=1 seg[seg_not_base==1]=4 - seg[seg_background==1]=0 + if not self.extract_only_images: + seg[seg_background==1]=0 seg[(seg_line==1) & (seg==0)]=3 seg_color = 
np.repeat(seg[:, :, np.newaxis], 3, axis=2) @@ -1573,6 +1618,60 @@ class Eynollah: q.put(slopes_sub) poly.put(poly_sub) box_sub.put(boxes_sub_new) + + def get_regions_light_v_extract_only_images(self,img,is_image_enhanced, num_col_classifier): + self.logger.debug("enter get_regions_light_v") + erosion_hurts = False + img_org = np.copy(img) + img_height_h = img_org.shape[0] + img_width_h = img_org.shape[1] + + #model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_ens) + + + img_resized = np.copy(img) + + + + if not self.dir_in: + model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_ens_light_only_images_extraction) + prediction_regions_org = self.do_prediction_new_concept(True, img_resized, model_region) + else: + prediction_regions_org = self.do_prediction_new_concept(True, img_resized, self.model_region) + + #plt.imshow(prediction_regions_org[:,:,0]) + #plt.show() + + prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h ) + + + prediction_regions_org=prediction_regions_org[:,:,0] + + mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + + mask_texts_only = (prediction_regions_org[:,:] ==1)*1 + + mask_images_only=(prediction_regions_org[:,:] ==2)*1 + + polygons_lines_xml, hir_lines_xml = return_contours_of_image(mask_lines_only) + polygons_lines_xml = textline_con_fil = filter_contours_area_of_image(mask_lines_only, polygons_lines_xml, hir_lines_xml, max_area=1, min_area=0.00001) + + + polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) + + polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + + text_regions_p_true = np.zeros(prediction_regions_org.shape) + + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + + text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 + + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) + + polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2) + + return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_light_v") erosion_hurts = False @@ -2824,6 +2923,8 @@ class Eynollah: Get image and scales, then extract the page of scanned image """ self.logger.debug("enter run") + + self.extract_only_images = True t0_tot = time.time() @@ -2836,272 +2937,286 @@ class Eynollah: if self.dir_in: self.reset_file_name_dir(os.path.join(self.dir_in,img_name)) - img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) - self.logger.info("Enhancing took %.1fs ", time.time() - t0) - t1 = time.time() - if self.light_version: - text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea = self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) - slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) + if self.extract_only_images: + img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) + self.logger.info("Enhancing took %.1fs ", time.time() - t0) + + text_regions_p_1 ,erosion_hurts, polygons_lines_xml,polygons_of_images = self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) 
- num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea = \ - self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, erosion_hurts) - #self.logger.info("run graphics %.1fs ", time.time() - t1t) - textline_mask_tot_ea_org = np.copy(textline_mask_tot_ea) - else: - text_regions_p_1 ,erosion_hurts, polygons_lines_xml = self.get_regions_from_xy_2models(img_res, is_image_enhanced, num_col_classifier) - self.logger.info("Textregion detection took %.1fs ", time.time() - t1) + + if self.plotter: + self.plotter.write_images_into_directory(polygons_of_images, img_res) + #plt.imshow(text_regions_p_1) + #plt.show() + else: + img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) + self.logger.info("Enhancing took %.1fs ", time.time() - t0) + t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page, table_prediction = \ - self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) - self.logger.info("Graphics detection took %.1fs ", time.time() - t1) - #self.logger.info('cont_page %s', cont_page) - - if not num_col: - self.logger.info("No columns detected, outputting an empty PAGE-XML") - pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [], cont_page, [], []) - self.logger.info("Job done in %.1fs", time.time() - t1) - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue + if self.light_version: + text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea = self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) + slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) + #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea = \ + self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, erosion_hurts) + #self.logger.info("run graphics %.1fs ", time.time() - t1t) + textline_mask_tot_ea_org = np.copy(textline_mask_tot_ea) else: - return pcgts + text_regions_p_1 ,erosion_hurts, polygons_lines_xml = self.get_regions_from_xy_2models(img_res, is_image_enhanced, num_col_classifier) + self.logger.info("Textregion detection took %.1fs ", time.time() - t1) + + t1 = time.time() + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page, table_prediction = \ + self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) + self.logger.info("Graphics detection took %.1fs ", time.time() - t1) + #self.logger.info('cont_page %s', cont_page) + + if not num_col: + self.logger.info("No columns detected, outputting an empty PAGE-XML") + pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [], cont_page, [], []) + self.logger.info("Job done in %.1fs", time.time() - t1) + if self.dir_in: + self.writer.write_pagexml(pcgts) + continue + else: + return pcgts - t1 = time.time() - if not self.light_version: - textline_mask_tot_ea = self.run_textline(image_page) 
- self.logger.info("textline detection took %.1fs", time.time() - t1) + t1 = time.time() + if not self.light_version: + textline_mask_tot_ea = self.run_textline(image_page) + self.logger.info("textline detection took %.1fs", time.time() - t1) + t1 = time.time() + slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) + self.logger.info("deskewing took %.1fs", time.time() - t1) t1 = time.time() - slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) - self.logger.info("deskewing took %.1fs", time.time() - t1) - t1 = time.time() - #plt.imshow(table_prediction) - #plt.show() - - textline_mask_tot, text_regions_p, image_page_rotated = self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) - self.logger.info("detection of marginals took %.1fs", time.time() - t1) - t1 = time.time() - if not self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, boxes, boxes_d, polygons_of_marginals, contours_tables = self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, table_prediction, erosion_hurts) - - if self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions, table_prediction, erosion_hurts) - text_only = ((img_revised_tab[:, :] == 1)) * 1 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 - - - min_con_area = 0.000005 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text, hir_on_text = return_contours_of_image(text_only) - contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - - if len(contours_only_text_parent) > 0: - areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) - #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) if areas_cnt_text[jz] > min_con_area] - areas_cnt_text_parent = [area for area in areas_cnt_text if area > min_con_area] - index_con_parents = np.argsort(areas_cnt_text_parent) - contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) - areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) - - contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) - contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) - - areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) - areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - - if len(areas_cnt_text_d)>0: - contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] - index_con_parents_d = np.argsort(areas_cnt_text_d) - contours_only_text_parent_d = 
list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d]) - areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d]) - - cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d]) - cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_d) - try: - if len(cx_bigest_d) >= 5: - cx_bigest_d_last5 = cx_bigest_d[-5:] - cy_biggest_d_last5 = cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) - else: - cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] - cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] - dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) - - cx_bigest_d_big[0] = cx_bigest_d[ind_largest] - cy_biggest_d_big[0] = cy_biggest_d[ind_largest] - except Exception as why: - self.logger.error(why) - - (h, w) = text_only.shape[:2] - center = (w // 2.0, h // 2.0) - M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) - M_22 = np.array(M)[:2, :2] - p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - x_diff = p_big[0] - cx_bigest_d_big - y_diff = p_big[1] - cy_biggest_d_big + #plt.imshow(table_prediction) + #plt.show() - contours_only_text_parent_d_ordered = [] - for i in range(len(contours_only_text_parent)): - p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) - p[0] = p[0] - x_diff[0] - p[1] = p[1] - y_diff[0] - dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + (p[1] - cy_biggest_d[j]) ** 2) for j in range(len(cx_bigest_d))] - contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) - # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) - # plt.imshow(img2[:,:,0]) - # plt.show() + textline_mask_tot, text_regions_p, image_page_rotated = self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) + self.logger.info("detection of marginals took %.1fs", time.time() - t1) + t1 = time.time() + if not self.full_layout: + polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, boxes, boxes_d, polygons_of_marginals, contours_tables = self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, table_prediction, erosion_hurts) + + if self.full_layout: + polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions, table_prediction, erosion_hurts) + text_only = ((img_revised_tab[:, :] == 1)) * 1 + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 + + + min_con_area = 0.000005 + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text, hir_on_text = return_contours_of_image(text_only) + contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + + if 
len(contours_only_text_parent) > 0: + areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) + areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + #self.logger.info('areas_cnt_text %s', areas_cnt_text) + contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) if areas_cnt_text[jz] > min_con_area] + areas_cnt_text_parent = [area for area in areas_cnt_text if area > min_con_area] + index_con_parents = np.argsort(areas_cnt_text_parent) + contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) + areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) + + cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) + cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) + + contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) + contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) + + areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) + areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) + + if len(areas_cnt_text_d)>0: + contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] + index_con_parents_d = np.argsort(areas_cnt_text_d) + contours_only_text_parent_d = list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d]) + areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d]) + + cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d]) + cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_d) + try: + if len(cx_bigest_d) >= 5: + cx_bigest_d_last5 = cx_bigest_d[-5:] + cy_biggest_d_last5 = cy_biggest_d[-5:] + dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) + else: + cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] + cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] + dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) + + cx_bigest_d_big[0] = cx_bigest_d[ind_largest] + cy_biggest_d_big[0] = cy_biggest_d[ind_largest] + except Exception as why: + self.logger.error(why) + + (h, w) = text_only.shape[:2] + center = (w // 2.0, h // 2.0) + M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) + M_22 = np.array(M)[:2, :2] + p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) + x_diff = p_big[0] - cx_bigest_d_big + y_diff = p_big[1] - cy_biggest_d_big + + contours_only_text_parent_d_ordered = [] + for i in range(len(contours_only_text_parent)): + p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) + p[0] = p[0] - x_diff[0] + p[1] = p[1] - y_diff[0] + dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + (p[1] - cy_biggest_d[j]) ** 2) for j in range(len(cx_bigest_d))] + contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) + # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) + # 
img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) + # plt.imshow(img2[:,:,0]) + # plt.show() + else: + contours_only_text_parent_d_ordered = [] + contours_only_text_parent_d = [] + contours_only_text_parent = [] + else: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] contours_only_text_parent = [] - - else: - contours_only_text_parent_d_ordered = [] - contours_only_text_parent_d = [] - contours_only_text_parent = [] - else: - contours_only_text, hir_on_text = return_contours_of_image(text_only) - contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - - if len(contours_only_text_parent) > 0: - areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) - - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) if areas_cnt_text[jz] > min_con_area] - areas_cnt_text_parent = [area for area in areas_cnt_text if area > min_con_area] - - index_con_parents = np.argsort(areas_cnt_text_parent) - contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) - areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) - #self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent) - # self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d) - # self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d)) else: - pass - if self.light_version: - txt_con_org = get_textregion_contours_in_org_image_light(contours_only_text_parent, self.image, slope_first) - else: - txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) - boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) - boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) - - if not self.curved_line: - if self.light_version: - if self.textline_light: - slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, image_page_rotated, boxes_marginals, slope_deskew) + contours_only_text, hir_on_text = return_contours_of_image(text_only) + contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + + if len(contours_only_text_parent) > 0: + areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) + areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + + contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) if areas_cnt_text[jz] > min_con_area] + areas_cnt_text_parent = [area for area in areas_cnt_text if area > 
min_con_area] + + index_con_parents = np.argsort(areas_cnt_text_parent) + contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) + areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) + + cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) + cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) + #self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent) + # self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d) + # self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d)) else: - slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + pass + if self.light_version: + txt_con_org = get_textregion_contours_in_org_image_light(contours_only_text_parent, self.image, slope_first) else: - slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) - - else: + txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) + boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) + boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) - scale_param = 1 - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_textline_polygons = small_textlines_to_parent_adherence2(all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - - if self.full_layout: - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) + if not self.curved_line: if self.light_version: - 
text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) + if self.textline_light: + slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew) + slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, image_page_rotated, boxes_marginals, slope_deskew) + else: + slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) + slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) else: - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) + slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) + slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + else: - #takes long timee - contours_only_text_parent_d_ordered = None - if self.light_version: - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) + + scale_param = 1 + all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, 
num_col_classifier, scale_param, slope_deskew) + all_found_textline_polygons = small_textlines_to_parent_adherence2(all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) + all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) + + if self.full_layout: + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) + if self.light_version: + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) + else: + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) else: - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) + #takes long timee + contours_only_text_parent_d_ordered = None + if self.light_version: + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) + else: + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) - if self.plotter: - self.plotter.save_plot_of_layout(text_regions_p, image_page) - self.plotter.save_plot_of_layout_all(text_regions_p, image_page) - - pixel_img = 4 - 
polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, kernel=KERNEL, curved_line=self.curved_line) - pixel_lines = 6 - + if self.plotter: + self.plotter.save_plot_of_layout(text_regions_p, image_page) + self.plotter.save_plot_of_layout_all(text_regions_p, image_page) + + pixel_img = 4 + polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) + all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, kernel=KERNEL, curved_line=self.curved_line) + pixel_lines = 6 + + + if not self.headers_off: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, self.tables, pixel_lines, contours_only_text_parent_h) + else: + _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, self.tables, pixel_lines, contours_only_text_parent_h_d_ordered) + elif self.headers_off: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, self.tables, pixel_lines) + else: + _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, self.tables, pixel_lines) + + if num_col_classifier >= 3: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + regions_without_separators = regions_without_separators.astype(np.uint8) + regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6) + + else: + regions_without_separators_d = regions_without_separators_d.astype(np.uint8) + regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) + - if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, self.tables, pixel_lines, contours_only_text_parent_h) + boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, self.tables, pixel_lines, contours_only_text_parent_h_d_ordered) - elif self.headers_off: + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) + + #print(boxes_d,'boxes_d') + #img_once = 
np.zeros((textline_mask_tot_d.shape[0],textline_mask_tot_d.shape[1])) + #for box_i in boxes_d: + #img_once[int(box_i[2]):int(box_i[3]),int(box_i[0]):int(box_i[1]) ] =1 + #plt.imshow(img_once) + #plt.show() + #print(np.unique(img_once),'img_once') + if self.plotter: + self.plotter.write_images_into_directory(polygons_of_images, image_page) + t_order = time.time() + if self.full_layout: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, self.tables, pixel_lines) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, self.tables, pixel_lines) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - if num_col_classifier >= 3: + pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) + self.logger.info("Job done in %.1fs", time.time() - t0) + ##return pcgts + else: + contours_only_text_parent_h = None if np.abs(slope_deskew) < SLOPE_THRESHOLD: - regions_without_separators = regions_without_separators.astype(np.uint8) - regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6) - + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - regions_without_separators_d = regions_without_separators_d.astype(np.uint8) - regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) - - - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) - else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) - - #print(boxes_d,'boxes_d') - #img_once = np.zeros((textline_mask_tot_d.shape[0],textline_mask_tot_d.shape[1])) - #for box_i in boxes_d: - #img_once[int(box_i[2]):int(box_i[3]),int(box_i[0]):int(box_i[1]) ] =1 - #plt.imshow(img_once) - #plt.show() - #print(np.unique(img_once),'img_once') - if self.plotter: - self.plotter.write_images_into_directory(polygons_of_images, image_page) - t_order = time.time() - if self.full_layout: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) - else: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, 
textline_mask_tot_d) - - pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) - self.logger.info("Job done in %.1fs", time.time() - t0) - ##return pcgts - else: - contours_only_text_parent_h = None - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) - else: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) - self.logger.info("Job done in %.1fs", time.time() - t0) - ##return pcgts - self.writer.write_pagexml(pcgts) - #self.logger.info("Job done in %.1fs", time.time() - t0) + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) + pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) + self.logger.info("Job done in %.1fs", time.time() - t0) + ##return pcgts + self.writer.write_pagexml(pcgts) + #self.logger.info("Job done in %.1fs", time.time() - t0) if self.dir_in: self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) From 6aac0b8fafb74046a7c1f5d11419f16b3c2d15ff Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 27 Nov 2023 22:12:50 +0100 Subject: [PATCH 05/47] avoiding artifact images on the boundary of documents --- qurator/eynollah/eynollah.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 2375ad3..0c11327 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -1669,9 +1669,39 @@ class Eynollah: text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) - polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2) + polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.0001) - return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images + image_boundary_of_doc = np.zeros((text_regions_p_true.shape[0], text_regions_p_true.shape[1])) + + image_boundary_of_doc[:20, :] = 1 + 
image_boundary_of_doc[text_regions_p_true.shape[0]-20:text_regions_p_true.shape[0], :] = 1 + + image_boundary_of_doc[:, :20] = 1 + image_boundary_of_doc[:, text_regions_p_true.shape[1]-20:text_regions_p_true.shape[1]] = 1 + + #plt.imshow(image_boundary_of_doc) + #plt.show() + + polygons_of_images_fin = [] + for ploy_img_ind in polygons_of_images: + test_poly_image = np.zeros((text_regions_p_true.shape[0], text_regions_p_true.shape[1])) + test_poly_image = cv2.fillPoly(test_poly_image, pts = [ploy_img_ind], color=(1,1,1)) + + test_poly_image = test_poly_image[:,:] + image_boundary_of_doc[:,:] + test_poly_image_intersected_area = ( test_poly_image[:,:]==2 )*1 + + test_poly_image_intersected_area = test_poly_image_intersected_area.sum() + + if test_poly_image_intersected_area==0: + polygons_of_images_fin.append(ploy_img_ind) + #plt.imshow(test_poly_image) + #plt.show() + + + + + + return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_light_v") erosion_hurts = False From 364ccacab2623b1b3c799aef2cfa8d23d408f33b Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 28 Nov 2023 00:50:45 +0100 Subject: [PATCH 06/47] adding extracting images only in cli --- qurator/eynollah/cli.py | 11 +++++++++++ qurator/eynollah/eynollah.py | 6 ++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py index a2a2ad0..a12a61d 100644 --- a/qurator/eynollah/cli.py +++ b/qurator/eynollah/cli.py @@ -67,6 +67,12 @@ from qurator.eynollah.eynollah import Eynollah is_flag=True, help="If set, will plot intermediary files and images", ) +@click.option( + "--extract_only_images/--disable-extracting_only_images", + "-eoi/-noeoi", + is_flag=True, + help="If a directory is given, only images in documents will be cropped and saved there and the other processing will not be done", +) @click.option( "--allow-enhancement/--no-allow-enhancement", "-ae/-noae", @@ -148,6 +154,7 @@ def main( save_layout, save_deskewed, save_all, + extract_only_images, save_page, enable_plotting, allow_enhancement, @@ -175,12 +182,16 @@ def main( if textline_light and not light_version: print('Error: You used -tll to enable light textline detection but -light is not enabled') sys.exit(1) + if extract_only_images and not ( save_images and enable_plotting): + print('Error: You used -eoi to enable extract images only mode but did not enable plotting with -ep and providing an output directory with -si') + sys.exit(1) eynollah = Eynollah( image_filename=image, dir_out=out, dir_in=dir_in, dir_models=model, dir_of_cropped_images=save_images, + extract_only_images=extract_only_images, dir_of_layout=save_layout, dir_of_deskewed=save_deskewed, dir_of_all=save_all, diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 0c11327..deb178f 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -148,6 +148,7 @@ class Eynollah: dir_out=None, dir_in=None, dir_of_cropped_images=None, + extract_only_images=False, dir_of_layout=None, dir_of_deskewed=None, dir_of_all=None, @@ -195,7 +196,7 @@ class Eynollah: self.allow_scaling = allow_scaling self.headers_off = headers_off self.light_version = light_version - self.extract_only_images = True + self.extract_only_images = extract_only_images self.ignore_page_extraction = ignore_page_extraction self.pcgts = pcgts if not dir_in: @@ -2953,9 +2954,6 @@ class Eynollah: Get image 
and scales, then extract the page of scanned image """ self.logger.debug("enter run") - - self.extract_only_images = True - t0_tot = time.time() From aa41e4df2025fb3f81a79802573e8d84a3e253b1 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 28 Nov 2023 21:37:55 +0100 Subject: [PATCH 07/47] The contours of images can now be written in an XML file --- qurator/eynollah/cli.py | 4 +-- qurator/eynollah/eynollah.py | 48 +++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py index a12a61d..9aba31d 100644 --- a/qurator/eynollah/cli.py +++ b/qurator/eynollah/cli.py @@ -182,8 +182,8 @@ def main( if textline_light and not light_version: print('Error: You used -tll to enable light textline detection but -light is not enabled') sys.exit(1) - if extract_only_images and not ( save_images and enable_plotting): - print('Error: You used -eoi to enable extract images only mode but did not enable plotting with -ep and providing an output directory with -si') + if extract_only_images and (allow_enhancement or allow_scaling or light_version) : + print('Error: You used -eoi which can not be enabled alongside light_version -light or allow_scaling -as or allow_enhancement -ae') sys.exit(1) eynollah = Eynollah( image_filename=image, diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index deb178f..5a8adeb 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -624,8 +624,11 @@ class Eynollah: image_res = np.copy(img) is_image_enhanced = False else: - img_new, num_column_is_classified = self.calculate_width_height_by_columns_extract_only_images(img, num_col, width_early, label_p_pred) - image_res = np.copy(img_new) + #img_new, num_column_is_classified = self.calculate_width_height_by_columns_extract_only_images(img, num_col, width_early, label_p_pred) + #image_res = np.copy(img_new) + #is_image_enhanced = True + num_column_is_classified = True + image_res = np.copy(img) is_image_enhanced = False self.logger.debug("exit resize_and_enhance_image_with_column_classifier") @@ -1621,16 +1624,27 @@ class Eynollah: box_sub.put(boxes_sub_new) def get_regions_light_v_extract_only_images(self,img,is_image_enhanced, num_col_classifier): - self.logger.debug("enter get_regions_light_v") + self.logger.debug("enter get_regions_extract_images_only") erosion_hurts = False img_org = np.copy(img) img_height_h = img_org.shape[0] img_width_h = img_org.shape[1] - #model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_ens) - + if num_col_classifier == 1: + img_w_new = 700 + elif num_col_classifier == 2: + img_w_new = 900 + elif num_col_classifier == 3: + img_w_new = 1500 + elif num_col_classifier == 4: + img_w_new = 1800 + elif num_col_classifier == 5: + img_w_new = 2200 + elif num_col_classifier == 6: + img_w_new = 2500 + img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new) - img_resized = np.copy(img) + img_resized = resize_image(img,img_h_new, img_w_new ) @@ -1644,6 +1658,11 @@ class Eynollah: #plt.show() prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h ) + + image_page, page_coord, cont_page = self.extract_page() + + + prediction_regions_org = prediction_regions_org[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] prediction_regions_org=prediction_regions_org[:,:,0] @@ -1695,6 +1714,13 @@ class Eynollah: if test_poly_image_intersected_area==0: polygons_of_images_fin.append(ploy_img_ind) + + 
#x, y, w, h = cv2.boundingRect(ploy_img_ind) + #box = [x, y, w, h] + #_, page_coord = crop_image_inside_box(box, text_regions_p_true) + #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) + + #polygons_of_images_fin.append(np.array(cont_page)) #plt.imshow(test_poly_image) #plt.show() @@ -1702,7 +1728,7 @@ class Eynollah: - return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin + return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_light_v") erosion_hurts = False @@ -2554,6 +2580,7 @@ class Eynollah: prediction_table_erode = cv2.erode(prediction_table[:,:,0], KERNEL, iterations=20) prediction_table_erode = cv2.dilate(prediction_table_erode, KERNEL, iterations=20) return prediction_table_erode.astype(np.int16) + def run_graphics_and_columns_light(self, text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, erosion_hurts): img_g = self.imread(grayscale=True, uint8=True) @@ -2970,13 +2997,16 @@ class Eynollah: img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) self.logger.info("Enhancing took %.1fs ", time.time() - t0) - text_regions_p_1 ,erosion_hurts, polygons_lines_xml,polygons_of_images = self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) - #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) + text_regions_p_1 ,erosion_hurts, polygons_lines_xml,polygons_of_images,image_page, page_coord, cont_page = self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) + + pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], polygons_of_images, [], [], [], [], [], cont_page, [], []) if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, img_res) #plt.imshow(text_regions_p_1) #plt.show() + + self.writer.write_pagexml(pcgts) else: img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) From 7cbca79f1676da3bead00acaba44157dab5de05c Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 1 Dec 2023 23:40:47 +0100 Subject: [PATCH 08/47] replacing images cotour with bounding box --- qurator/eynollah/eynollah.py | 32 +++++++++++++++++--------------- qurator/eynollah/writer.py | 16 ++++++++++++---- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 5a8adeb..e3e3a20 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -1693,17 +1693,18 @@ class Eynollah: image_boundary_of_doc = np.zeros((text_regions_p_true.shape[0], text_regions_p_true.shape[1])) - image_boundary_of_doc[:20, :] = 1 - image_boundary_of_doc[text_regions_p_true.shape[0]-20:text_regions_p_true.shape[0], :] = 1 + ###image_boundary_of_doc[:6, :] = 1 + ###image_boundary_of_doc[text_regions_p_true.shape[0]-6:text_regions_p_true.shape[0], :] = 1 - image_boundary_of_doc[:, :20] = 1 - image_boundary_of_doc[:, text_regions_p_true.shape[1]-20:text_regions_p_true.shape[1]] = 1 + ###image_boundary_of_doc[:, :6] = 1 + ###image_boundary_of_doc[:, text_regions_p_true.shape[1]-6:text_regions_p_true.shape[1]] = 1 #plt.imshow(image_boundary_of_doc) 
#plt.show() polygons_of_images_fin = [] for ploy_img_ind in polygons_of_images: + """ test_poly_image = np.zeros((text_regions_p_true.shape[0], text_regions_p_true.shape[1])) test_poly_image = cv2.fillPoly(test_poly_image, pts = [ploy_img_ind], color=(1,1,1)) @@ -1713,20 +1714,21 @@ class Eynollah: test_poly_image_intersected_area = test_poly_image_intersected_area.sum() if test_poly_image_intersected_area==0: - polygons_of_images_fin.append(ploy_img_ind) + ##polygons_of_images_fin.append(ploy_img_ind) - #x, y, w, h = cv2.boundingRect(ploy_img_ind) - #box = [x, y, w, h] - #_, page_coord = crop_image_inside_box(box, text_regions_p_true) + x, y, w, h = cv2.boundingRect(ploy_img_ind) + box = [x, y, w, h] + _, page_coord_img = crop_image_inside_box(box, text_regions_p_true) #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) - #polygons_of_images_fin.append(np.array(cont_page)) - #plt.imshow(test_poly_image) - #plt.show() - - - - + polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) ) + """ + x, y, w, h = cv2.boundingRect(ploy_img_ind) + box = [x, y, w, h] + _, page_coord_img = crop_image_inside_box(box, text_regions_p_true) + #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) + + polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) ) return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier): diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index f537f65..4487af5 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -172,10 +172,18 @@ class EynollahXmlWriter(): page.add_ImageRegion(img_region) points_co = '' for lmm in range(len(found_polygons_text_region_img[mm])): - points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - points_co += ' ' + try: + points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) + points_co += ' ' + except: + + points_co += str(int((found_polygons_text_region_img[mm][lmm][0] + page_coord[2])/ self.scale_x )) + points_co += ',' + points_co += str(int((found_polygons_text_region_img[mm][lmm][1] + page_coord[0])/ self.scale_y )) + points_co += ' ' + img_region.get_Coords().set_points(points_co[:-1]) for mm in range(len(polygons_lines_to_be_written_in_xml)): From f09b7c1bef9e91f244232eb88fab48f59624f822 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Tue, 19 Mar 2024 20:29:10 +0100 Subject: [PATCH 09/47] use tf1 compatibility for keras backend --- qurator/eynollah/eynollah.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 49422fa..c162af7 100644 --- 
a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -29,7 +29,8 @@ warnings.filterwarnings("ignore") from scipy.signal import find_peaks import matplotlib.pyplot as plt from scipy.ndimage import gaussian_filter1d -from tensorflow.python.keras.backend import set_session +# use tf1 compatibility for keras backend +from tensorflow.compat.v1.keras.backend import set_session from tensorflow.keras import layers from .utils.contour import ( From b3fa68439559479f2786c12482fd9270af9b4075 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Tue, 19 Mar 2024 20:30:40 +0100 Subject: [PATCH 10/47] pin tf2 version to 2.12.1 until we fix keras compatibility --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 530dac2..f01d319 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ ocrd >= 2.23.3 numpy <1.24.0 scikit-learn >= 0.23.2 -tensorflow >=2.12.0 +tensorflow == 2.12.1 imutils >= 0.5.3 matplotlib setuptools >= 50 From 533736a3e355c37fbe7bea8c993c502992390f85 Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Wed, 20 Mar 2024 00:28:22 +0100 Subject: [PATCH 11/47] update supported Python+Tensorflow version combinations --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index b095edb..2dc90ec 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,7 @@ * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface ## Installation -Python versions `3.8-3.11` with Tensorflow versions >=`2.12` on Linux are currently supported. Unfortunately we can not currently support Windows or MacOS. -Windows users may be able to successfully run the tool through [WSL](https://learn.microsoft.com/en-us/windows/wsl/). +Python versions `3.8-3.11` with Tensorflow versions `2.12-2.15` on Linux are currently supported. For (limited) GPU support the CUDA toolkit needs to be installed. 
From ba64282118cd4891067a69825d7e03614c4eada7 Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:58:24 +0200 Subject: [PATCH 12/47] Update README.md --- README.md | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2dc90ec..302880a 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # Eynollah -> Document Layout Analysis (segmentation) using pre-trained models and heuristics +> Document Layout Analysis with Deep Learning and Heuristics [![PyPI Version](https://img.shields.io/pypi/v/eynollah)](https://pypi.org/project/eynollah/) [![CircleCI Build Status](https://circleci.com/gh/qurator-spk/eynollah.svg?style=shield)](https://circleci.com/gh/qurator-spk/eynollah) [![GH Actions Test](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml/badge.svg)](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml) [![License: ASL](https://img.shields.io/github/license/qurator-spk/eynollah)](https://opensource.org/license/apache-2-0/) +[![DOI](https://img.shields.io/badge/DOI-10.1145%2F3604951.3605513-red)](https://doi.org/10.1145/3604951.3605513) ![](https://user-images.githubusercontent.com/952378/102350683-8a74db80-3fa5-11eb-8c7e-f743f7d6eae2.jpg) @@ -14,16 +15,19 @@ * Support for various image optimization operations: * cropping (border detection), binarization, deskewing, dewarping, scaling, enhancing, resizing * Text line segmentation to bounding boxes or polygons (contours) including for curved lines and vertical text -* Detection of reading order +* Detection of reading order (left-to-right or right-to-left) * Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface +:warning: Eynollah development is currently focused on achieving high quality results for a wide variety of historical documents. +Processing can be very slow, with a lot of potential to improve. We aim to work on this too, but contributions are always welcome. + ## Installation -Python versions `3.8-3.11` with Tensorflow versions `2.12-2.15` on Linux are currently supported. +Python `3.8-3.11` with Tensorflow `2.12-2.15` on Linux are currently supported. For (limited) GPU support the CUDA toolkit needs to be installed. -You can either install via +You can either install from PyPI ``` pip install eynollah @@ -39,18 +43,21 @@ cd eynollah; pip install -e . Alternatively, you can run `make install` or `make install-dev` for editable installation. ## Models -Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/). +Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB). -In case you want to train your own model to use with Eynollah, have a look at [sbb_pixelwise_segmentation](https://github.com/qurator-spk/sbb_pixelwise_segmentation). +## Train +🚧 **Work in progress** + +In case you want to train your own model, have a look at [`sbb_pixelwise_segmentation`](https://github.com/qurator-spk/sbb_pixelwise_segmentation). 
## Usage The command-line interface can be called like this: ```sh eynollah \ - -i \ + -i | -di \ -o \ - -m \ + -m \ [OPTIONS] ``` @@ -67,7 +74,6 @@ The following options can be used to further configure the processing: | `-ib` | apply binarization (the resulting image is saved to the output directory) | | `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | | `-ho` | ignore headers for reading order dectection | -| `-di ` | process all images in a directory in batch mode | | `-si ` | save image regions detected to this directory | | `-sd ` | save deskewed image to this directory | | `-sl ` | save layout prediction as plot to this directory | @@ -78,6 +84,7 @@ If no option is set, the tool will perform layout detection of main regions (bac The tool produces better quality output when RGB images are used as input than greyscale or binarized images. #### Use as OCR-D processor +🚧 **Work in progress** Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) processor. @@ -95,11 +102,14 @@ ocrd-eynollah-segment -I OCR-D-IMG-BIN -O SEG-LINE -P models uses the original (RGB) image despite any binarization that may have occured in previous OCR-D processing steps +#### Additional documentation +Please check the [wiki](https://github.com/qurator-spk/eynollah/wiki). + ## How to cite If you find this tool useful in your work, please consider citing our paper: ```bibtex -@inproceedings{rezanezhad2023eynollah, +@inproceedings{hip23rezanezhad, title = {Document Layout Analysis with Deep Learning and Heuristics}, author = {Rezanezhad, Vahid and Baierer, Konstantin and Gerber, Mike and Labusch, Kai and Neudecker, Clemens}, booktitle = {Proceedings of the 7th International Workshop on Historical Document Imaging and Processing {HIP} 2023, From 899bb9f00c3b14306eb96c2a4955a0d599cc175a Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Wed, 10 Apr 2024 15:27:29 +0200 Subject: [PATCH 13/47] update GitHub actions --- .github/workflows/test-eynollah.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 30c9729..5a1acf4 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -14,8 +14,8 @@ jobs: python-version: ['3.8', '3.9', '3.10', '3.11'] steps: - - uses: actions/checkout@v2 - - uses: actions/cache@v2 + - uses: actions/checkout@v4 + - uses: actions/cache@v4 id: model_cache with: path: models_eynollah @@ -24,7 +24,7 @@ jobs: if: steps.model_cache.outputs.cache-hit != 'true' run: make models - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies From f88ee99f3c8aea2772abdfef6b8cbc919682a794 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 23 May 2024 21:17:38 +0200 Subject: [PATCH 14/47] non-legacy namespace package --- qurator/__init__.py | 1 - qurator/eynollah/__init__.py | 1 - setup.py | 1 - 3 files changed, 3 deletions(-) delete mode 100644 qurator/__init__.py diff --git a/qurator/__init__.py b/qurator/__init__.py deleted file mode 100644 index 5284146..0000000 --- a/qurator/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__import__("pkg_resources").declare_namespace(__name__) diff --git a/qurator/eynollah/__init__.py b/qurator/eynollah/__init__.py index 8b13789..e69de29 100644 --- 
a/qurator/eynollah/__init__.py +++ b/qurator/eynollah/__init__.py @@ -1 +0,0 @@ - diff --git a/setup.py b/setup.py index 9abf158..c78ee3f 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,6 @@ setup( author='Vahid Rezanezhad', url='https://github.com/qurator-spk/eynollah', license='Apache License 2.0', - namespace_packages=['qurator'], packages=find_packages(exclude=['tests']), install_requires=install_requires, package_data={ From 45bd76f5e81c305446750360c7ac62e38f454bac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 May 2024 14:27:56 +0000 Subject: [PATCH 15/47] fix namespace pkg setup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c78ee3f..af8a321 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import setup, find_namespace_packages from json import load install_requires = open('requirements.txt').read().split('\n') @@ -13,7 +13,7 @@ setup( author='Vahid Rezanezhad', url='https://github.com/qurator-spk/eynollah', license='Apache License 2.0', - packages=find_packages(exclude=['tests']), + packages=find_namespace_packages(include=['qurator']), install_requires=install_requires, package_data={ '': ['*.json'] From ad133e34251b0164cca059542240690762dfb7db Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Wed, 31 Jul 2024 19:49:43 +0200 Subject: [PATCH 16/47] Update model download url --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 525e6c3..439b534 100644 --- a/Makefile +++ b/Makefile @@ -24,12 +24,13 @@ models: models_eynollah models_eynollah: models_eynollah.tar.gz # tar xf models_eynollah_renamed.tar.gz --transform 's/models_eynollah_renamed/models_eynollah/' # tar xf models_eynollah_renamed.tar.gz - tar xf 2022-04-05.SavedModel.tar.gz --transform 's/models_eynollah_renamed/models_eynollah/' + tar xf models_eynollah_renamed_savedmodel.tar.gz --transform 's/models_eynollah_renamed_savedmodel/models_eynollah/' models_eynollah.tar.gz: # wget 'https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz' # wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz' - wget 'https://ocr-d.kba.cloud/2022-04-05.SavedModel.tar.gz' + # wget 'https://ocr-d.kba.cloud/2022-04-05.SavedModel.tar.gz' + wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed_savedmodel.tar.gz' # Install with pip install: From 3cfa447e84027867798a4c358244ed9ce0095ae9 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Wed, 31 Jul 2024 20:01:36 +0200 Subject: [PATCH 17/47] remove CircleCI --- .circleci/config.yml | 51 -------------------------------------------- README.md | 1 - 2 files changed, 52 deletions(-) delete mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index d2b7057..0000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,51 +0,0 @@ -version: 2 - -jobs: - - build-python37: - machine: - - image: ubuntu-2004:2023.02.1 - - steps: - - checkout - - restore_cache: - keys: - - model-cache - - run: make models - - save_cache: - key: model-cache - paths: - models_eynollah.tar.gz - models_eynollah - - run: - name: "Set Python Version" - command: pyenv install -s 3.7.16 && pyenv global 3.7.16 - - run: make install - - run: make smoke-test - - build-python38: - machine: - - image: ubuntu-2004:2023.02.1 - steps: - - checkout - - restore_cache: - keys: 
- - model-cache - - run: make models - - save_cache: - key: model-cache - paths: - models_eynollah.tar.gz - models_eynollah - - run: - name: "Set Python Version" - command: pyenv install -s 3.8.16 && pyenv global 3.8.16 - - run: make install - - run: make smoke-test - -workflows: - version: 2 - build: - jobs: - # - build-python37 - - build-python38 diff --git a/README.md b/README.md index 302880a..3b4f784 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ > Document Layout Analysis with Deep Learning and Heuristics [![PyPI Version](https://img.shields.io/pypi/v/eynollah)](https://pypi.org/project/eynollah/) -[![CircleCI Build Status](https://circleci.com/gh/qurator-spk/eynollah.svg?style=shield)](https://circleci.com/gh/qurator-spk/eynollah) [![GH Actions Test](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml/badge.svg)](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml) [![License: ASL](https://img.shields.io/github/license/qurator-spk/eynollah)](https://opensource.org/license/apache-2-0/) [![DOI](https://img.shields.io/badge/DOI-10.1145%2F3604951.3605513-red)](https://doi.org/10.1145/3604951.3605513) From 40f5408b1e576eb83983f28d4fcd68c298d79899 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Wed, 31 Jul 2024 20:02:56 +0200 Subject: [PATCH 18/47] improve huggingface url --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b4f784..f7a0a77 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ cd eynollah; pip install -e . Alternatively, you can run `make install` or `make install-dev` for editable installation. ## Models -Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB). +Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB?search_models=eynollah). ## Train 🚧 **Work in progress** From 38698c66097e7f3793eb4143a0519d4b36aa053f Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Wed, 31 Jul 2024 21:16:02 +0200 Subject: [PATCH 19/47] Update README.md --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f7a0a77..b47eae3 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,7 @@ * Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface -:warning: Eynollah development is currently focused on achieving high quality results for a wide variety of historical documents. -Processing can be very slow, with a lot of potential to improve. We aim to work on this too, but contributions are always welcome. +:warning: Eynollah development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are always welcome. ## Installation Python `3.8-3.11` with Tensorflow `2.12-2.15` on Linux are currently supported. @@ -79,8 +78,8 @@ The following options can be used to further configure the processing: | `-sp ` | save cropped page image to this directory | | `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | -If no option is set, the tool will perform layout detection of main regions (background, text, images, separators and marginals). 
-The tool produces better quality output when RGB images are used as input than greyscale or binarized images. +If no option is set, the tool performs layout detection of main regions (background, text, images, separators and marginals). +Best quality output is produced when RGB images are used as input rather than greyscale or binarized images. #### Use as OCR-D processor 🚧 **Work in progress** From 8862df9156b73eae0c1afb43dd7082f4115555dd Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Wed, 31 Jul 2024 22:53:36 +0200 Subject: [PATCH 20/47] format options table --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b47eae3..a92ad87 100644 --- a/README.md +++ b/README.md @@ -61,22 +61,22 @@ eynollah \ The following options can be used to further configure the processing: -| option | description | -|----------|:-------------| -| `-fl` | full layout analysis including all steps and segmentation classes | -| `-light` | lighter and faster but simpler method for main region detection and deskewing | -| `-tab` | apply table detection | -| `-ae` | apply enhancement (the resulting image is saved to the output directory) | -| `-as` | apply scaling | -| `-cl` | apply contour detection for curved text lines instead of bounding boxes | -| `-ib` | apply binarization (the resulting image is saved to the output directory) | -| `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | -| `-ho` | ignore headers for reading order dectection | -| `-si ` | save image regions detected to this directory | -| `-sd ` | save deskewed image to this directory | -| `-sl ` | save layout prediction as plot to this directory | -| `-sp ` | save cropped page image to this directory | -| `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | +| option | description | +|-------------------|:-------------------------------------------------------------------------------| +| `-fl` | full layout analysis including all steps and segmentation classes | +| `-light` | lighter and faster but simpler method for main region detection and deskewing | +| `-tab` | apply table detection | +| `-ae` | apply enhancement (the resulting image is saved to the output directory) | +| `-as` | apply scaling | +| `-cl` | apply contour detection for curved text lines instead of bounding boxes | +| `-ib` | apply binarization (the resulting image is saved to the output directory) | +| `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | +| `-ho` | ignore headers for reading order dectection | +| `-si ` | save image regions detected to this directory | +| `-sd ` | save deskewed image to this directory | +| `-sl ` | save layout prediction as plot to this directory | +| `-sp ` | save cropped page image to this directory | +| `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | If no option is set, the tool performs layout detection of main regions (background, text, images, separators and marginals). Best quality output is produced when RGB images are used as input rather than greyscale or binarized images. 
From c9f63826c05d5ddf975174a6ae28e7f7d9912fc0 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Thu, 1 Aug 2024 00:13:42 +0200 Subject: [PATCH 21/47] create draft pyproject.toml --- pyproject.toml.txt | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 pyproject.toml.txt diff --git a/pyproject.toml.txt b/pyproject.toml.txt new file mode 100644 index 0000000..43d7093 --- /dev/null +++ b/pyproject.toml.txt @@ -0,0 +1,38 @@ +[build-system] +requires = ["setuptools>=61.0", "setuptools-ocrd"] + +[project] +name = "eynollah" +version = "0.3.0" +authors = [ + {name = "Vahid Rezanezhad"}, + {name = "Staatsbibliothek zu Berlin - Preußischer Kulturbesitz"}, +] +description = "Document Layout Analysis" +readme = "README.md" +license.file = "LICENSE" +requires-python = ">=3.8" +keywords = ["document layout analysis", "image segmentation"] + +dynamic = ["dependencies"] + +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Scientific/Engineering :: Image Processing", +] + +[project.scripts] +eynollah = "eynollah.eynollah.cli:main" +ocrd-eynollah-segment = "eynollah.eynollah.ocrd_cli:main" + +[project.urls] +Homepage = "https://github.com/qurator-spk/eynollah" +Repository = "https://github.com/qurator-spk/eynollah.git" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} From 7ded54a8d21b14fff3c4d048a33710910476b834 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Thu, 1 Aug 2024 00:25:31 +0200 Subject: [PATCH 22/47] rename GH action --- .github/workflows/test-eynollah.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 5a1acf4..98ddc06 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python package +name: Test on: [push] From f0e7f75499577bea004bff5b7a3e8b5a673688a1 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Thu, 1 Aug 2024 00:30:25 +0200 Subject: [PATCH 23/47] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a92ad87..1720f7f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ * Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface -:warning: Eynollah development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are always welcome. +:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome. ## Installation Python `3.8-3.11` with Tensorflow `2.12-2.15` on Linux are currently supported. 
@@ -79,7 +79,7 @@ The following options can be used to further configure the processing: | `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | If no option is set, the tool performs layout detection of main regions (background, text, images, separators and marginals). -Best quality output is produced when RGB images are used as input rather than greyscale or binarized images. +The best output quality is produced when RGB images are used as input rather than greyscale or binarized images. #### Use as OCR-D processor 🚧 **Work in progress** From 9170a9f21c795430e55473df4090e08fa04922a7 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 6 Aug 2024 16:11:32 +0200 Subject: [PATCH 24/47] only images extraction - update inference parameters --- qurator/eynollah/eynollah.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index e3e3a20..a5d7b38 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -260,7 +260,7 @@ class Eynollah: self.model_page = self.our_load_model(self.model_page_dir) self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) - #self.model_bin = self.our_load_model(self.model_dir_of_binarization) + self.model_bin = self.our_load_model(self.model_dir_of_binarization) #self.model_textline = self.our_load_model(self.model_textline_dir) self.model_region = self.our_load_model(self.model_region_dir_p_ens_light_only_images_extraction) #self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np) @@ -917,7 +917,8 @@ class Eynollah: ##seg2 = -label_p_pred[0,:,:,2] if self.extract_only_images: - seg_not_base[seg_not_base>0.3] =1 + #seg_not_base[seg_not_base>0.3] =1 + seg_not_base[seg_not_base>0.5] =1 seg_not_base[seg_not_base<1] =0 else: seg_not_base[seg_not_base>0.03] =1 @@ -955,7 +956,7 @@ class Eynollah: ##plt.show() #seg[seg==1]=0 #seg[seg_test==1]=1 - seg[seg_not_base==1]=4 + ###seg[seg_not_base==1]=4 if not self.extract_only_images: seg[seg_background==1]=0 seg[(seg_line==1) & (seg==0)]=3 @@ -1689,7 +1690,13 @@ class Eynollah: text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) - polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.0001) + + + text_regions_p_true[text_regions_p_true.shape[0]-15:text_regions_p_true.shape[0], :] = 0 + text_regions_p_true[:, text_regions_p_true.shape[1]-15:text_regions_p_true.shape[1]] = 0 + + ##polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.0001) + polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.001) image_boundary_of_doc = np.zeros((text_regions_p_true.shape[0], text_regions_p_true.shape[1])) From 8e2cdad1be6c7ad6577f495eab22495671f4428c Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 7 Aug 2024 23:22:27 +0200 Subject: [PATCH 25/47] extracting images only - avoid artifacts with heuristics --- qurator/eynollah/eynollah.py | 15 +++++++----- run_image_extraction_over_ppn_lists.py | 33 ++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 run_image_extraction_over_ppn_lists.py diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index a5d7b38..6c3fa3e 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -1731,11 +1731,14 @@ class Eynollah: polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], 
[page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) ) """ x, y, w, h = cv2.boundingRect(ploy_img_ind) - box = [x, y, w, h] - _, page_coord_img = crop_image_inside_box(box, text_regions_p_true) - #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) - - polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) ) + if h < 150 or w < 150: + pass + else: + box = [x, y, w, h] + _, page_coord_img = crop_image_inside_box(box, text_regions_p_true) + #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) + + polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) ) return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier): @@ -3011,7 +3014,7 @@ class Eynollah: pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], polygons_of_images, [], [], [], [], [], cont_page, [], []) if self.plotter: - self.plotter.write_images_into_directory(polygons_of_images, img_res) + self.plotter.write_images_into_directory(polygons_of_images, image_page) #plt.imshow(text_regions_p_1) #plt.show() diff --git a/run_image_extraction_over_ppn_lists.py b/run_image_extraction_over_ppn_lists.py new file mode 100644 index 0000000..a890022 --- /dev/null +++ b/run_image_extraction_over_ppn_lists.py @@ -0,0 +1,33 @@ +import os +import sys + +dir_ppn = '/home/vahid/Documents/eynollah/ppn_list.txt' + + +with open(dir_ppn) as f: + ppn_list = f.readlines() + + +ppn_list = [ind.split('\n')[0] for ind in ppn_list] + +url_main = 'https://content.staatsbibliothek-berlin.de/dc/download/zip?ppn=PPN' + +out_result = './new_results_ppns2' + + +for ppn_ind in ppn_list: + url = url_main + ppn_ind + #curl -o ./ppn.zip "https://content.staatsbibliothek-berlin.de/dc/download/zip?ppn=PPN1762638355" + os.system("curl -o "+"./PPN_"+ppn_ind+".zip"+" "+url) + os.system("unzip "+"PPN_"+ppn_ind+".zip"+ " -d "+"PPN_"+ppn_ind) + os.system("rm -rf "+"PPN_"+ppn_ind+"/*.txt") + + os.system("mkdir "+out_result+'/'+"PPN_"+ppn_ind+"_out") + os.system("mkdir "+out_result+'/'+"PPN_"+ppn_ind+"_out_images") + command_eynollah = "eynollah -m /home/vahid/Downloads/models_eynollah_renamed_savedmodel -di "+"PPN_"+ppn_ind+" "+"-o "+out_result+'/'+"PPN_"+ppn_ind+"_out "+"-eoi "+"-ep -si "+out_result+'/'+"PPN_"+ppn_ind+"_out_images" + os.system(command_eynollah) + + os.system("rm -rf "+"PPN_"+ppn_ind+".zip") + os.system("rm -rf "+"PPN_"+ppn_ind) + #sys.exit() + From e3edb0ec30826541817263c0a4a52419fe430ca9 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 9 Aug 2024 02:23:17 +0200 Subject: [PATCH 26/47] update --- qurator/eynollah/cli.py | 8 +++++--- qurator/eynollah/eynollah.py | 12 ++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py index a2a2ad0..822db18 100644 --- a/qurator/eynollah/cli.py +++ b/qurator/eynollah/cli.py @@ -198,9 +198,11 @@ def main( light_version=light_version, 
ignore_page_extraction=ignore_page_extraction, ) - eynollah.run() - #pcgts = eynollah.run() - ##eynollah.writer.write_pagexml(pcgts) + if dir_in: + eynollah.run() + else: + pcgts = eynollah.run() + eynollah.writer.write_pagexml(pcgts) if __name__ == "__main__": main() diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index c162af7..7f5561c 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -3091,7 +3091,8 @@ class Eynollah: pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) self.logger.info("Job done in %.1fs", time.time() - t0) - ##return pcgts + if not self.dir_in: + return pcgts else: contours_only_text_parent_h = None if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3101,8 +3102,11 @@ class Eynollah: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) self.logger.info("Job done in %.1fs", time.time() - t0) - ##return pcgts - self.writer.write_pagexml(pcgts) - #self.logger.info("Job done in %.1fs", time.time() - t0) + if not self.dir_in: + return pcgts + + if self.dir_in: + self.writer.write_pagexml(pcgts) + #self.logger.info("Job done in %.1fs", time.time() - t0) if self.dir_in: self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) From 23ac58405c1642413aa34f493c43ed279bda4945 Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:47:32 +0200 Subject: [PATCH 27/47] update pyproject.toml --- pyproject.toml.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pyproject.toml.txt b/pyproject.toml.txt index 43d7093..760c040 100644 --- a/pyproject.toml.txt +++ b/pyproject.toml.txt @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0", "setuptools-ocrd"] +requires = ["setuptools>=61.0", "wheel", "setuptools-ocrd"] [project] name = "eynollah" @@ -30,9 +30,20 @@ classifiers = [ eynollah = "eynollah.eynollah.cli:main" ocrd-eynollah-segment = "eynollah.eynollah.ocrd_cli:main" +[project.readme] +file = "README.md" +content-type = "text/markdown" + [project.urls] Homepage = "https://github.com/qurator-spk/eynollah" Repository = "https://github.com/qurator-spk/eynollah.git" [tool.setuptools.dynamic] dependencies = {file = ["requirements.txt"]} + +[tool.setuptools.packages.find] +where = ["src"] +namespaces = false + +[tool.setuptools.package-data] +"*" = ["*.json", '*.yml', '*.xml', '*.xsd'] From 28ee1e527ea96ce992ebc534401ba171179de9f9 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Wed, 14 Aug 2024 19:50:57 +0200 Subject: [PATCH 28/47] update pyproject.toml for v0.3.1 --- pyproject.toml | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 
100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8f83249 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel", "setuptools-ocrd"] + +[project] +name = "eynollah" +version = "0.3.0" +authors = [ + {name = "Vahid Rezanezhad"}, + {name = "Staatsbibliothek zu Berlin - Preußischer Kulturbesitz"}, +] +description = "Document Layout Analysis" +readme = "README.md" +license.file = "LICENSE" +requires-python = ">=3.8" +keywords = ["document layout analysis", "image segmentation"] + +dynamic = ["dependencies"] + +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Scientific/Engineering :: Image Processing", +] + +[project.scripts] +eynollah = "qurator.eynollah.cli:main" +ocrd-eynollah-segment = "qurator.eynollah.ocrd_cli:main" + +[project.urls] +Homepage = "https://github.com/qurator-spk/eynollah" +Repository = "https://github.com/qurator-spk/eynollah.git" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[tool.setuptools.packages.find] +where = ["qurator"] + +[tool.setuptools.package-data] +"*" = ["*.json", '*.yml', '*.xml', '*.xsd'] From 8f769663946c0074557a039bc5c8059ec9d410fc Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Wed, 14 Aug 2024 19:51:48 +0200 Subject: [PATCH 29/47] update pyproject.toml for v0.3.1 --- pyproject.toml.txt | 49 ---------------------------------------------- 1 file changed, 49 deletions(-) delete mode 100644 pyproject.toml.txt diff --git a/pyproject.toml.txt b/pyproject.toml.txt deleted file mode 100644 index 760c040..0000000 --- a/pyproject.toml.txt +++ /dev/null @@ -1,49 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0", "wheel", "setuptools-ocrd"] - -[project] -name = "eynollah" -version = "0.3.0" -authors = [ - {name = "Vahid Rezanezhad"}, - {name = "Staatsbibliothek zu Berlin - Preußischer Kulturbesitz"}, -] -description = "Document Layout Analysis" -readme = "README.md" -license.file = "LICENSE" -requires-python = ">=3.8" -keywords = ["document layout analysis", "image segmentation"] - -dynamic = ["dependencies"] - -classifiers = [ - "Development Status :: 4 - Beta", - "Environment :: Console", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Topic :: Scientific/Engineering :: Image Processing", -] - -[project.scripts] -eynollah = "eynollah.eynollah.cli:main" -ocrd-eynollah-segment = "eynollah.eynollah.ocrd_cli:main" - -[project.readme] -file = "README.md" -content-type = "text/markdown" - -[project.urls] -Homepage = "https://github.com/qurator-spk/eynollah" -Repository = "https://github.com/qurator-spk/eynollah.git" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[tool.setuptools.packages.find] -where = ["src"] -namespaces = false - -[tool.setuptools.package-data] -"*" = ["*.json", '*.yml', '*.xml', '*.xsd'] From 7f99526b9dae4aff85fa01092aeb921f8c699cf5 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Thu, 15 Aug 2024 23:59:18 +0200 Subject: [PATCH 30/47] update Makefile model location --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) 
diff --git a/Makefile b/Makefile index 439b534..4b43564 100644 --- a/Makefile +++ b/Makefile @@ -24,13 +24,15 @@ models: models_eynollah models_eynollah: models_eynollah.tar.gz # tar xf models_eynollah_renamed.tar.gz --transform 's/models_eynollah_renamed/models_eynollah/' # tar xf models_eynollah_renamed.tar.gz - tar xf models_eynollah_renamed_savedmodel.tar.gz --transform 's/models_eynollah_renamed_savedmodel/models_eynollah/' + # tar xf models_eynollah_renamed_savedmodel.tar.gz --transform 's/models_eynollah_renamed_savedmodel/models_eynollah/' + tar xf models_eynollah.tar.gz models_eynollah.tar.gz: # wget 'https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz' # wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz' # wget 'https://ocr-d.kba.cloud/2022-04-05.SavedModel.tar.gz' - wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed_savedmodel.tar.gz' + # wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed_savedmodel.tar.gz' + wget https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz # Install with pip install: From 84d05bd0ae93c2fa09c3e5fa40caa8660241fffa Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 23 Aug 2024 14:01:20 +0200 Subject: [PATCH 31/47] s,url,local_filename, --- qurator/eynollah/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index ccec456..1bd190e 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -42,7 +42,7 @@ class EynollahProcessor(Processor): page = pcgts.get_Page() # XXX loses DPI information # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename + image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename eynollah_kwargs = { 'dir_models': self.resolve_resource(self.parameter['models']), 'allow_enhancement': False, From 9ae05754364ed815dd73d74d79edc00a9f65fef4 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 27 Aug 2024 14:52:01 +0200 Subject: [PATCH 32/47] :memo: changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index da2e1c0..0fd3938 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + + * regression in OCR-D processor, #106 + * Expected Ptrcv::UMat for argument 'contour', #110 + * Memory usage explosion with very narrow images (e.g. book spine), #67 + ## [0.3.0] - 2023-05-13 Changed: From a5c7f223d1713ac2770bafd08dd3fc6d4b8e29a3 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 27 Aug 2024 14:54:59 +0200 Subject: [PATCH 33/47] :package: v0.3.1 --- CHANGELOG.md | 4 ++++ pyproject.toml | 2 +- qurator/eynollah/ocrd-tool.json | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fd3938..cf6263d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [0.3.1] - 2024-08-27 + Fixed: * regression in OCR-D processor, #106 @@ -123,6 +125,8 @@ Fixed: Initial release +[0.3.1]: ../../compare/v0.3.1...v0.3.0 +[0.3.0]: ../../compare/v0.3.0...v0.2.0 [0.2.0]: ../../compare/v0.2.0...v0.1.0 [0.1.0]: ../../compare/v0.1.0...v0.0.11 [0.0.11]: ../../compare/v0.0.11...v0.0.10 diff --git a/pyproject.toml b/pyproject.toml index 8f83249..d6f16b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0", "wheel", "setuptools-ocrd"] [project] name = "eynollah" -version = "0.3.0" +version = "0.3.1" authors = [ {name = "Vahid Rezanezhad"}, {name = "Staatsbibliothek zu Berlin - Preußischer Kulturbesitz"}, diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index 8a2cb95..4551168 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.3.0", + "version": "0.3.1", "git_url": "https://github.com/qurator-spk/eynollah", "tools": { "ocrd-eynollah-segment": { From 62314c453ce7cbe0c66061b88a0367d4163124a2 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 27 Aug 2024 15:04:57 +0200 Subject: [PATCH 34/47] fully transition to pyproject --- pyproject.toml | 3 +-- setup.py | 28 ++-------------------------- 2 files changed, 3 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d6f16b3..8f9f175 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,6 @@ requires = ["setuptools>=61.0", "wheel", "setuptools-ocrd"] [project] name = "eynollah" -version = "0.3.1" authors = [ {name = "Vahid Rezanezhad"}, {name = "Staatsbibliothek zu Berlin - Preußischer Kulturbesitz"}, @@ -14,7 +13,7 @@ license.file = "LICENSE" requires-python = ">=3.8" keywords = ["document layout analysis", "image segmentation"] -dynamic = ["dependencies"] +dynamic = ["dependencies", "version"] classifiers = [ "Development Status :: 4 - Beta", diff --git a/setup.py b/setup.py index af8a321..6068493 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,3 @@ -from setuptools import setup, find_namespace_packages -from json import load +from setuptools import setup -install_requires = open('requirements.txt').read().split('\n') -with open('ocrd-tool.json', 'r', encoding='utf-8') as f: - version = load(f)['version'] - -setup( - name='eynollah', - version=version, - long_description=open('README.md').read(), - long_description_content_type='text/markdown', - author='Vahid Rezanezhad', - url='https://github.com/qurator-spk/eynollah', - license='Apache License 2.0', - packages=find_namespace_packages(include=['qurator']), - install_requires=install_requires, - package_data={ - '': ['*.json'] - }, - entry_points={ - 'console_scripts': [ - 'eynollah=qurator.eynollah.cli:main', - 'ocrd-eynollah-segment=qurator.eynollah.ocrd_cli:main', - ] - }, -) +setup() From 9367f86483329f7771d2d63cb063107d258f5412 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 29 Aug 2024 17:06:39 +0200 Subject: [PATCH 35/47] remove setup.py stub completely --- setup.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 setup.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 6068493..0000000 --- a/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -from setuptools import setup - -setup() From 84b844203d7a1cb27fccefd19dee2869b0abe3b2 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 29 Aug 2024 17:11:29 +0200 Subject: [PATCH 36/47] switch from qurator namespace to src-layout --- ocrd-tool.json | 2 +- pyproject.toml | 6 +++--- qurator/.gitkeep | 0 {qurator => src}/eynollah/__init__.py 
| 0 {qurator => src}/eynollah/cli.py | 2 +- {qurator => src}/eynollah/eynollah.py | 0 {qurator => src}/eynollah/ocrd-tool.json | 0 {qurator => src}/eynollah/ocrd_cli.py | 0 {qurator => src}/eynollah/plot.py | 0 {qurator => src}/eynollah/processor.py | 0 {qurator => src}/eynollah/utils/__init__.py | 0 {qurator => src}/eynollah/utils/contour.py | 0 {qurator => src}/eynollah/utils/counter.py | 0 {qurator => src}/eynollah/utils/drop_capitals.py | 0 {qurator => src}/eynollah/utils/is_nan.py | 0 {qurator => src}/eynollah/utils/marginals.py | 0 {qurator => src}/eynollah/utils/pil_cv2.py | 0 {qurator => src}/eynollah/utils/resize.py | 0 {qurator => src}/eynollah/utils/rotate.py | 0 {qurator => src}/eynollah/utils/separate_lines.py | 0 {qurator => src}/eynollah/utils/xml.py | 0 {qurator => src}/eynollah/writer.py | 0 tests/test_counter.py | 2 +- tests/test_dpi.py | 2 +- tests/test_run.py | 2 +- tests/test_smoke.py | 12 ++++++------ tests/test_xml.py | 2 +- 27 files changed, 15 insertions(+), 15 deletions(-) delete mode 100644 qurator/.gitkeep rename {qurator => src}/eynollah/__init__.py (100%) rename {qurator => src}/eynollah/cli.py (99%) rename {qurator => src}/eynollah/eynollah.py (100%) rename {qurator => src}/eynollah/ocrd-tool.json (100%) rename {qurator => src}/eynollah/ocrd_cli.py (100%) rename {qurator => src}/eynollah/plot.py (100%) rename {qurator => src}/eynollah/processor.py (100%) rename {qurator => src}/eynollah/utils/__init__.py (100%) rename {qurator => src}/eynollah/utils/contour.py (100%) rename {qurator => src}/eynollah/utils/counter.py (100%) rename {qurator => src}/eynollah/utils/drop_capitals.py (100%) rename {qurator => src}/eynollah/utils/is_nan.py (100%) rename {qurator => src}/eynollah/utils/marginals.py (100%) rename {qurator => src}/eynollah/utils/pil_cv2.py (100%) rename {qurator => src}/eynollah/utils/resize.py (100%) rename {qurator => src}/eynollah/utils/rotate.py (100%) rename {qurator => src}/eynollah/utils/separate_lines.py (100%) rename {qurator => src}/eynollah/utils/xml.py (100%) rename {qurator => src}/eynollah/writer.py (100%) diff --git a/ocrd-tool.json b/ocrd-tool.json index 5c48493..711a192 120000 --- a/ocrd-tool.json +++ b/ocrd-tool.json @@ -1 +1 @@ -qurator/eynollah/ocrd-tool.json \ No newline at end of file +src/eynollah/ocrd-tool.json \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 8f9f175..67a420d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,8 @@ classifiers = [ ] [project.scripts] -eynollah = "qurator.eynollah.cli:main" -ocrd-eynollah-segment = "qurator.eynollah.ocrd_cli:main" +eynollah = "eynollah.cli:main" +ocrd-eynollah-segment = "eynollah.ocrd_cli:main" [project.urls] Homepage = "https://github.com/qurator-spk/eynollah" @@ -37,7 +37,7 @@ Repository = "https://github.com/qurator-spk/eynollah.git" dependencies = {file = ["requirements.txt"]} [tool.setuptools.packages.find] -where = ["qurator"] +where = ["src"] [tool.setuptools.package-data] "*" = ["*.json", '*.yml', '*.xml', '*.xsd'] diff --git a/qurator/.gitkeep b/qurator/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/qurator/eynollah/__init__.py b/src/eynollah/__init__.py similarity index 100% rename from qurator/eynollah/__init__.py rename to src/eynollah/__init__.py diff --git a/qurator/eynollah/cli.py b/src/eynollah/cli.py similarity index 99% rename from qurator/eynollah/cli.py rename to src/eynollah/cli.py index 822db18..d61928f 100644 --- a/qurator/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -1,7 +1,7 @@ import sys 
import click from ocrd_utils import initLogging, setOverrideLogLevel -from qurator.eynollah.eynollah import Eynollah +from eynollah.eynollah import Eynollah @click.command() diff --git a/qurator/eynollah/eynollah.py b/src/eynollah/eynollah.py similarity index 100% rename from qurator/eynollah/eynollah.py rename to src/eynollah/eynollah.py diff --git a/qurator/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json similarity index 100% rename from qurator/eynollah/ocrd-tool.json rename to src/eynollah/ocrd-tool.json diff --git a/qurator/eynollah/ocrd_cli.py b/src/eynollah/ocrd_cli.py similarity index 100% rename from qurator/eynollah/ocrd_cli.py rename to src/eynollah/ocrd_cli.py diff --git a/qurator/eynollah/plot.py b/src/eynollah/plot.py similarity index 100% rename from qurator/eynollah/plot.py rename to src/eynollah/plot.py diff --git a/qurator/eynollah/processor.py b/src/eynollah/processor.py similarity index 100% rename from qurator/eynollah/processor.py rename to src/eynollah/processor.py diff --git a/qurator/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py similarity index 100% rename from qurator/eynollah/utils/__init__.py rename to src/eynollah/utils/__init__.py diff --git a/qurator/eynollah/utils/contour.py b/src/eynollah/utils/contour.py similarity index 100% rename from qurator/eynollah/utils/contour.py rename to src/eynollah/utils/contour.py diff --git a/qurator/eynollah/utils/counter.py b/src/eynollah/utils/counter.py similarity index 100% rename from qurator/eynollah/utils/counter.py rename to src/eynollah/utils/counter.py diff --git a/qurator/eynollah/utils/drop_capitals.py b/src/eynollah/utils/drop_capitals.py similarity index 100% rename from qurator/eynollah/utils/drop_capitals.py rename to src/eynollah/utils/drop_capitals.py diff --git a/qurator/eynollah/utils/is_nan.py b/src/eynollah/utils/is_nan.py similarity index 100% rename from qurator/eynollah/utils/is_nan.py rename to src/eynollah/utils/is_nan.py diff --git a/qurator/eynollah/utils/marginals.py b/src/eynollah/utils/marginals.py similarity index 100% rename from qurator/eynollah/utils/marginals.py rename to src/eynollah/utils/marginals.py diff --git a/qurator/eynollah/utils/pil_cv2.py b/src/eynollah/utils/pil_cv2.py similarity index 100% rename from qurator/eynollah/utils/pil_cv2.py rename to src/eynollah/utils/pil_cv2.py diff --git a/qurator/eynollah/utils/resize.py b/src/eynollah/utils/resize.py similarity index 100% rename from qurator/eynollah/utils/resize.py rename to src/eynollah/utils/resize.py diff --git a/qurator/eynollah/utils/rotate.py b/src/eynollah/utils/rotate.py similarity index 100% rename from qurator/eynollah/utils/rotate.py rename to src/eynollah/utils/rotate.py diff --git a/qurator/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py similarity index 100% rename from qurator/eynollah/utils/separate_lines.py rename to src/eynollah/utils/separate_lines.py diff --git a/qurator/eynollah/utils/xml.py b/src/eynollah/utils/xml.py similarity index 100% rename from qurator/eynollah/utils/xml.py rename to src/eynollah/utils/xml.py diff --git a/qurator/eynollah/writer.py b/src/eynollah/writer.py similarity index 100% rename from qurator/eynollah/writer.py rename to src/eynollah/writer.py diff --git a/tests/test_counter.py b/tests/test_counter.py index 8ef0756..42bf074 100644 --- a/tests/test_counter.py +++ b/tests/test_counter.py @@ -1,5 +1,5 @@ from tests.base import main -from qurator.eynollah.utils.counter import EynollahIdCounter +from eynollah.utils.counter import 
EynollahIdCounter def test_counter_string(): c = EynollahIdCounter() diff --git a/tests/test_dpi.py b/tests/test_dpi.py index 510ffc5..3376bf4 100644 --- a/tests/test_dpi.py +++ b/tests/test_dpi.py @@ -1,6 +1,6 @@ import cv2 from pathlib import Path -from qurator.eynollah.utils.pil_cv2 import check_dpi +from eynollah.utils.pil_cv2 import check_dpi from tests.base import main def test_dpi(): diff --git a/tests/test_run.py b/tests/test_run.py index b1137e7..2596dad 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -2,7 +2,7 @@ from os import environ from pathlib import Path from ocrd_utils import pushd_popd from tests.base import CapturingTestCase as TestCase, main -from qurator.eynollah.cli import main as eynollah_cli +from eynollah.cli import main as eynollah_cli testdir = Path(__file__).parent.resolve() diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d069479..252213f 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1,7 +1,7 @@ def test_utils_import(): - import qurator.eynollah.utils - import qurator.eynollah.utils.contour - import qurator.eynollah.utils.drop_capitals - import qurator.eynollah.utils.drop_capitals - import qurator.eynollah.utils.is_nan - import qurator.eynollah.utils.rotate + import eynollah.utils + import eynollah.utils.contour + import eynollah.utils.drop_capitals + import eynollah.utils.drop_capitals + import eynollah.utils.is_nan + import eynollah.utils.rotate diff --git a/tests/test_xml.py b/tests/test_xml.py index 8422fd1..09a6ddf 100644 --- a/tests/test_xml.py +++ b/tests/test_xml.py @@ -1,5 +1,5 @@ from pytest import main -from qurator.eynollah.utils.xml import create_page_xml +from eynollah.utils.xml import create_page_xml from ocrd_models.ocrd_page import to_xml PAGE_2019 = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15' From b6d3d2bdbfc206bcfaaeea67c5cbf68bed2f32b4 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Mon, 2 Sep 2024 20:11:42 +0200 Subject: [PATCH 37/47] fix indentation --- src/eynollah/eynollah.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0081643..56036eb 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3273,8 +3273,9 @@ class Eynollah: pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) self.logger.info("Job done in %.1fs", time.time() - t0) - if not self.dir_in: - return pcgts + + if not self.dir_in: + return pcgts else: contours_only_text_parent_h = None if np.abs(slope_deskew) < SLOPE_THRESHOLD: From 6b2e5d110e15c7a0b5d69217a1288e4833169ade Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 3 Sep 2024 13:55:55 +0200 Subject: [PATCH 38/47] all tests are passed --- src/eynollah/eynollah.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 56036eb..caa1978 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3271,25 +3271,25 @@ class Eynollah: else: order_text_new, id_of_texts_tot = 
self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) - self.logger.info("Job done in %.1fs", time.time() - t0) + pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) + self.logger.info("Job done in %.1fs", time.time() - t0) - if not self.dir_in: - return pcgts - else: - contours_only_text_parent_h = None - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + if not self.dir_in: + return pcgts else: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) - self.logger.info("Job done in %.1fs", time.time() - t0) - if not self.dir_in: - return pcgts + contours_only_text_parent_h = None + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + else: + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) + pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) + self.logger.info("Job done in %.1fs", time.time() - t0) + if not self.dir_in: + return pcgts + if self.dir_in: + self.writer.write_pagexml(pcgts) + #self.logger.info("Job done in %.1fs", time.time() - t0) if self.dir_in: - self.writer.write_pagexml(pcgts) - #self.logger.info("Job done in %.1fs", time.time() - t0) - if self.dir_in: - self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) + self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) From 
c156a1612ec8a379d209f5926e7941d5dcfe8e90 Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Tue, 3 Sep 2024 20:03:44 +0200 Subject: [PATCH 39/47] Exclude `run_image_extraction_over_ppn_lists.py` from merge --- run_image_extraction_over_ppn_lists.py | 33 -------------------------- 1 file changed, 33 deletions(-) delete mode 100644 run_image_extraction_over_ppn_lists.py diff --git a/run_image_extraction_over_ppn_lists.py b/run_image_extraction_over_ppn_lists.py deleted file mode 100644 index a890022..0000000 --- a/run_image_extraction_over_ppn_lists.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import sys - -dir_ppn = '/home/vahid/Documents/eynollah/ppn_list.txt' - - -with open(dir_ppn) as f: - ppn_list = f.readlines() - - -ppn_list = [ind.split('\n')[0] for ind in ppn_list] - -url_main = 'https://content.staatsbibliothek-berlin.de/dc/download/zip?ppn=PPN' - -out_result = './new_results_ppns2' - - -for ppn_ind in ppn_list: - url = url_main + ppn_ind - #curl -o ./ppn.zip "https://content.staatsbibliothek-berlin.de/dc/download/zip?ppn=PPN1762638355" - os.system("curl -o "+"./PPN_"+ppn_ind+".zip"+" "+url) - os.system("unzip "+"PPN_"+ppn_ind+".zip"+ " -d "+"PPN_"+ppn_ind) - os.system("rm -rf "+"PPN_"+ppn_ind+"/*.txt") - - os.system("mkdir "+out_result+'/'+"PPN_"+ppn_ind+"_out") - os.system("mkdir "+out_result+'/'+"PPN_"+ppn_ind+"_out_images") - command_eynollah = "eynollah -m /home/vahid/Downloads/models_eynollah_renamed_savedmodel -di "+"PPN_"+ppn_ind+" "+"-o "+out_result+'/'+"PPN_"+ppn_ind+"_out "+"-eoi "+"-ep -si "+out_result+'/'+"PPN_"+ppn_ind+"_out_images" - os.system(command_eynollah) - - os.system("rm -rf "+"PPN_"+ppn_ind+".zip") - os.system("rm -rf "+"PPN_"+ppn_ind) - #sys.exit() - From 478edc804a0de01a1966a0df24468344e7a26cf0 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 16 Sep 2024 18:21:14 +0200 Subject: [PATCH 40/47] Add Dockerfile and make docker target --- Dockerfile | 26 ++++++++++++++++++++++++++ Makefile | 14 ++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6c76564 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,26 @@ +ARG DOCKER_BASE_IMAGE +FROM $DOCKER_BASE_IMAGE + +ARG VCS_REF +ARG BUILD_DATE +LABEL \ + maintainer="https://ocr-d.de/kontakt" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.vcs-url="https://github.com/qurator-spk/eynollah" \ + org.label-schema.build-date=$BUILD_DATE + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONIOENCODING=utf8 +ENV XDG_DATA_HOME=/usr/local/share + +WORKDIR /build-eynollah +COPY qurator/ ./qurator +COPY pyproject.toml . +COPY requirements.txt . +COPY README.md . +COPY Makefile . +RUN apt-get install -y --no-install-recommends g++ +RUN make install + +WORKDIR /data +VOLUME /data diff --git a/Makefile b/Makefile index 4b43564..a3bde05 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,11 @@ EYNOLLAH_MODELS ?= $(PWD)/models_eynollah export EYNOLLAH_MODELS +# DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0 +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.68.0 +DOCKER_TAG = ocrd/eynollah + + # BEGIN-EVAL makefile-parser --make-help Makefile help: @@ -48,3 +53,12 @@ smoke-test: # Run unit tests test: pytest tests + +# Build docker image +docker: + docker build \ + --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ + --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ + --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + -t $(DOCKER_TAG) . 
+ From 351e9a897a390cc5978346ae56bd725f021876d9 Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Tue, 17 Sep 2024 21:32:23 +0200 Subject: [PATCH 41/47] update `ocrd-tool.json` with v0.3.1 models --- src/eynollah/ocrd-tool.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index 4551168..b840005 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -52,10 +52,10 @@ }, "resources": [ { - "description": "models for eynollah (TensorFlow format)", - "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz", + "description": "models for eynollah (TensorFlow SavedModel format)", + "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz", "name": "default", - "size": 1761991295, + "size": 1894627041, "type": "archive", "path_in_archive": "models_eynollah" } From 327b446a16cc0d28281d41c96de1062c18293601 Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Tue, 17 Sep 2024 21:39:17 +0200 Subject: [PATCH 42/47] update Makefile with v0.3.1 models --- Makefile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 4b43564..e0ff6a9 100644 --- a/Makefile +++ b/Makefile @@ -22,17 +22,14 @@ help: models: models_eynollah models_eynollah: models_eynollah.tar.gz - # tar xf models_eynollah_renamed.tar.gz --transform 's/models_eynollah_renamed/models_eynollah/' - # tar xf models_eynollah_renamed.tar.gz - # tar xf models_eynollah_renamed_savedmodel.tar.gz --transform 's/models_eynollah_renamed_savedmodel/models_eynollah/' tar xf models_eynollah.tar.gz models_eynollah.tar.gz: # wget 'https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz' # wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz' - # wget 'https://ocr-d.kba.cloud/2022-04-05.SavedModel.tar.gz' # wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed_savedmodel.tar.gz' - wget https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz + # wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz' + wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz' # Install with pip install: From 74a0699f6bd441315e20223da81851ef1be53121 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 19 Sep 2024 11:20:13 +0200 Subject: [PATCH 43/47] extracting images only now works for a single image input --- src/eynollah/eynollah.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index caa1978..511e994 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3013,10 +3013,11 @@ class Eynollah: if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) - #plt.imshow(text_regions_p_1) - #plt.show() - self.writer.write_pagexml(pcgts) + if self.dir_in: + self.writer.write_pagexml(pcgts) + else: + return pcgts else: img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) From 723f27bec44104632bc62bce39f59f44cb6be97a Mon Sep 17 00:00:00 2001 From: michalbubula <149780022+michalbubula@users.noreply.github.com> Date: Thu, 19 Sep 2024 14:41:17 +0200 Subject: [PATCH 44/47] Add -eoi option to README.md --- README.md | 1 + 1 
file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1720f7f..292cfbc 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ The following options can be used to further configure the processing: | `-cl` | apply contour detection for curved text lines instead of bounding boxes | | `-ib` | apply binarization (the resulting image is saved to the output directory) | | `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | +| `-eoi` | extract only images to output directory (other processing will not be done) | | `-ho` | ignore headers for reading order detection | | `-si <directory>` | save image regions detected to this directory | | `-sd <directory>` | save deskewed image to this directory | From d168edfd77119fb9501cf66aaa0f5f42a687f248 Mon Sep 17 00:00:00 2001 From: michalbubula <149780022+michalbubula@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:20:37 +0200 Subject: [PATCH 45/47] Update cli.py to block other processing in the case of extract_image_only --- src/eynollah/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 82505ed..564b8b0 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -182,8 +182,8 @@ def main( if textline_light and not light_version: print('Error: You used -tll to enable light textline detection but -light is not enabled') sys.exit(1) - if extract_only_images and (allow_enhancement or allow_scaling or light_version) : - print('Error: You used -eoi which can not be enabled alongside light_version -light or allow_scaling -as or allow_enhancement -ae') + if extract_only_images and (allow_enhancement or allow_scaling or light_version or curved_line or textline_light or full_layout or tables or right2left or headers_off) : + print('Error: You used -eoi which can not be enabled alongside light_version -light or allow_scaling -as or allow_enhancement -ae or curved_line -cl or textline_light -tll or full_layout -fl or tables -tab or right2left -r2l or headers_off -ho') sys.exit(1) eynollah = Eynollah( image_filename=image, From c487be2a1dfcb444f1896dd725183b6dfc8fb96f Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 1 Oct 2024 15:38:01 +0200 Subject: [PATCH 46/47] dockerfile: use src-layout --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6c76564..6780bc2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ ENV PYTHONIOENCODING=utf8 ENV XDG_DATA_HOME=/usr/local/share WORKDIR /build-eynollah -COPY qurator/ ./qurator +COPY src/ ./src COPY pyproject.toml . COPY requirements.txt . COPY README.md . From b13759fdcf50db60966ec98050fa95bddb54728a Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 1 Oct 2024 15:38:39 +0200 Subject: [PATCH 47/47] ci: smoke-test make docker --- .github/workflows/test-eynollah.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 98ddc06..3a33dcf 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -34,3 +34,5 @@ jobs: pip install -r requirements-test.txt - name: Test with pytest run: make test + - name: Test docker build + run: make docker
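
The container-related patches (40, 46, 47) fit together: the Dockerfile installs the src-layout package, `make docker` builds the image, and CI now smoke-tests that build. A minimal local sketch of the same check, assuming the default DOCKER_TAG of ocrd/eynollah from the Makefile and that the ocrd/core base image leaves the container command free; the mounted path is only a placeholder:

  # build the image the same way the new CI step does
  make docker
  # run the installed CLI from the image, mounting the working directory onto the /data volume
  docker run --rm -v $PWD:/data ocrd/eynollah eynollah --help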
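
Likewise, patches 43-45 round off the image-only extraction mode: it now works for a single input image, it is documented in the README, and the CLI rejects incompatible layout options. A usage sketch modelled on the invocation in the removed run_image_extraction_over_ppn_lists.py script (model and output paths are placeholders):

  # extract only image regions; -ep and -si remain compatible with -eoi
  eynollah -m ./models_eynollah -di ./scans -o ./out -eoi -ep -si ./out_images
  # combining -eoi with e.g. -light, -fl or -tab now prints an error and exits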