diff --git a/README.md b/README.md index 0283fe9..59da45a 100644 --- a/README.md +++ b/README.md @@ -120,9 +120,6 @@ The following options can be used to further configure the processing: | `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | | `-thart` | threshold of artifical class in the case of textline detection. The default value is 0.1 | | `-tharl` | threshold of artifical class in the case of layout detection. The default value is 0.1 | -| `-ocr` | do ocr | -| `-tr` | apply transformer ocr. Default model is a CNN-RNN model | -| `-bs_ocr` | ocr inference batch size. Default bs for trocr and cnn_rnn models are 2 and 8 respectively | | `-ncu` | upper limit of columns in document image | | `-ncl` | lower limit of columns in document image | | `-slro` | skip layout detection and reading order | diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c37864a..8a4cb3a 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -321,7 +321,7 @@ def enhancement(ctx, image, out, overwrite, dir_in, num_col_upper, num_col_lower "--input_binary/--input-RGB", "-ib/-irgb", is_flag=True, - help="in general, eynollah uses RGB as input but if the input document is strongly dark, bright or for any other reason you can turn binarized input on. This option does not mean that you have to provide a binary image, otherwise this means that the tool itself will binarized the RGB input document.", + help="In general, eynollah uses RGB as input but if the input document is very dark, very bright or for any other reason you can turn on input binarization. When this flag is set, eynollah will binarize the RGB input document, you should always provide RGB images to eynollah.", ) @click.option( "--allow_scaling/--no-allow-scaling", @@ -353,23 +353,6 @@ def enhancement(ctx, image, out, overwrite, dir_in, num_col_upper, num_col_lower is_flag=True, help="if this parameter set to true, this tool would apply machine based reading order detection", ) -@click.option( - "--do_ocr", - "-ocr/-noocr", - is_flag=True, - help="if this parameter set to true, this tool will try to do ocr", -) -@click.option( - "--transformer_ocr", - "-tr/-notr", - is_flag=True, - help="if this parameter set to true, this tool will apply transformer ocr", -) -@click.option( - "--batch_size_ocr", - "-bs_ocr", - help="number of inference batch size of ocr model. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", -) @click.option( "--num_col_upper", "-ncu", @@ -421,9 +404,6 @@ def layout( headers_off, light_version, reading_order_machine_based, - do_ocr, - transformer_ocr, - batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, @@ -470,9 +450,6 @@ def layout( light_version=light_version, ignore_page_extraction=ignore_page_extraction, reading_order_machine_based=reading_order_machine_based, - do_ocr=do_ocr, - transformer_ocr=transformer_ocr, - batch_size_ocr=batch_size_ocr, num_col_upper=num_col_upper, num_col_lower=num_col_lower, skip_layout_and_reading_order=skip_layout_and_reading_order, @@ -506,7 +483,15 @@ def layout( @click.option( "--dir_in_bin", "-dib", - help="directory of binarized images (in addition to --dir_in for RGB images; filename stems must match the RGB image files, with '.png' suffix).\nPerform prediction using both RGB and binary images. 
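For context, the `-ocr`, `-tr`, and `-bs_ocr` options removed above followed a standard click pattern: a paired on/off flag plus an optional batch-size value that the constructor later resolved to a model-specific default. A minimal, self-contained sketch of that wiring (the `layout` stub and the echoed output are illustrative only, not eynollah code):

```python
import click

@click.command()
@click.option("--do_ocr", "-ocr/-noocr", is_flag=True,
              help="run OCR on the detected text lines")
@click.option("--batch_size_ocr", "-bs_ocr",
              help="OCR inference batch size (model-specific default if omitted)")
def layout(do_ocr, batch_size_ocr):
    # the removed constructor code fell back to a batch size of 8 when none was given
    b_s_ocr = int(batch_size_ocr) if batch_size_ocr else 8
    click.echo(f"do_ocr={do_ocr}, effective batch size={b_s_ocr}")

if __name__ == "__main__":
    layout()
```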
(This does not necessarily improve results, however it may be beneficial for certain document images.)", + help=(""" + directory of binarized images (in addition to --dir_in for RGB + images; filename stems must match the RGB image files, with '.png' + \n + Perform prediction using both RGB and binary images. + (This does not necessarily improve results, however it may be beneficial + for certain document images. +"""), + type=click.Path(exists=True, file_okay=False), ) @click.option( diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index dc90f1d..45fabd1 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -95,19 +95,6 @@ from .utils.rotate import ( rotation_not_90_func_full_layout, rotation_image_new ) -from .utils.utils_ocr import ( - return_start_and_end_of_common_text_of_textline_ocr_without_common_section, - return_textline_contour_with_added_box_coordinate, - preprocess_and_resize_image_for_ocrcnn_model, - return_textlines_split_if_needed, - decode_batch_predictions, - return_rnn_cnn_ocr_of_given_textlines, - fit_text_single_line, - break_curved_line_into_small_pieces_and_then_merge, - get_orientation_moments, - rotate_image_with_padding, - get_contours_and_bounding_boxes -) from .utils.separate_lines import ( separate_lines_new2, return_deskew_slop, @@ -176,9 +163,6 @@ class Eynollah: light_version : bool = False, ignore_page_extraction : bool = False, reading_order_machine_based : bool = False, - do_ocr : bool = False, - transformer_ocr: bool = False, - batch_size_ocr: Optional[int] = None, num_col_upper : Optional[int] = None, num_col_lower : Optional[int] = None, threshold_art_class_layout: Optional[float] = None, @@ -209,12 +193,6 @@ class Eynollah: self.extract_only_images = extract_only_images self.ignore_page_extraction = ignore_page_extraction self.skip_layout_and_reading_order = skip_layout_and_reading_order - self.ocr = do_ocr - self.tr = transformer_ocr - if not batch_size_ocr: - self.b_s_ocr = 8 - else: - self.b_s_ocr = int(batch_size_ocr) if num_col_upper: self.num_col_upper = int(num_col_upper) else: @@ -284,14 +262,6 @@ class Eynollah: if self.tables: loadable.append(("table", 'light' if self.light_version else '')) - if self.ocr: - if self.tr: - loadable.append(('ocr', 'tr')) - loadable.append(('trocr_processor', '')) - else: - loadable.append('ocr') - loadable.append('num_to_char') - self.model_zoo.load_models(*loadable) def __del__(self): @@ -2078,15 +2048,7 @@ class Eynollah: ###img_bin = np.copy(prediction_bin) ###else: ###img_bin = np.copy(img_resized) - if (self.ocr and self.tr) and not self.input_binary: - prediction_bin = self.do_prediction(True, img_resized, self.model_zoo.get("binarization"), n_batch_inference=5) - prediction_bin = 255 * (prediction_bin[:,:,0] == 0) - prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) - prediction_bin = prediction_bin.astype(np.uint16) - #img= np.copy(prediction_bin) - img_bin = np.copy(prediction_bin) - else: - img_bin = np.copy(img_resized) + img_bin = np.copy(img_resized) #print("inside 1 ", time.time()-t_in) ###textline_mask_tot_ea = self.run_textline(img_bin) @@ -3586,190 +3548,13 @@ class Eynollah: region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] return ordered, region_ids - def return_start_and_end_of_common_text_of_textline_ocr(self,textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.2*width) - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. 
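The branch removed in the hunk above converted the binarization model's class map into a three-channel black-and-white image before further processing. A minimal sketch of that conversion, with a dummy array standing in for the model output (the helper name is mine; the removed code did this inline):

```python
import numpy as np

def class_map_to_binary_rgb(prediction: np.ndarray) -> np.ndarray:
    """Mirror of the removed conversion: pixels whose predicted class is 0
    become white (255), everything else black, replicated to three channels."""
    binary = 255 * (prediction[:, :, 0] == 0)
    binary = np.repeat(binary[:, :, np.newaxis], 3, axis=2)
    return binary.astype(np.uint16)  # the removed code kept uint16 here

# illustrative call with a placeholder prediction map
dummy_prediction = np.zeros((32, 32, 3), dtype=np.uint8)
img_bin = class_map_to_binary_rgb(dummy_prediction)
assert img_bin.shape == (32, 32, 3)
```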
+ common_window )
-
- img_sum = np.sum(textline_image[:,:,0], axis=0)
- sum_smoothed = gaussian_filter1d(img_sum, 3)
-
- peaks_real, _ = find_peaks(sum_smoothed, height=0)
-
- if len(peaks_real)>70:
- print(len(peaks_real), 'len(peaks_real)')
- peaks_real = peaks_real[(peaks_real<width2) & (peaks_real>width1)]
-
- arg_sort = np.argsort(sum_smoothed[peaks_real])
- arg_sort4 =arg_sort[::-1][:4]
- peaks_sort_4 = peaks_real[arg_sort][::-1][:4]
- argsort_sorted = np.argsort(peaks_sort_4)
- first_4_sorted = peaks_sort_4[argsort_sorted]
- y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]]
- #print(first_4_sorted,'first_4_sorted')
-
- arg_sortnew = np.argsort(y_4_sorted)
- peaks_final =np.sort( first_4_sorted[arg_sortnew][2:] )
-
- #plt.figure(ind_tot)
- #plt.imshow(textline_image)
- #plt.plot([peaks_final[0], peaks_final[0]], [0, height-1])
- #plt.plot([peaks_final[1], peaks_final[1]], [0, height-1])
- #plt.savefig('./'+str(ind_tot)+'.png')
-
- return peaks_final[0], peaks_final[1]
- else:
- pass
-
- def return_start_and_end_of_common_text_of_textline_ocr_new_splitted(
- self, peaks_real, sum_smoothed, start_split, end_split):
-
- peaks_real = peaks_real[(peaks_real<end_split) & (peaks_real>start_split)]
-
- arg_sort = np.argsort(sum_smoothed[peaks_real])
- arg_sort4 =arg_sort[::-1][:4]
- peaks_sort_4 = peaks_real[arg_sort][::-1][:4]
- argsort_sorted = np.argsort(peaks_sort_4)
-
- first_4_sorted = peaks_sort_4[argsort_sorted]
- y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]]
- #print(first_4_sorted,'first_4_sorted')
-
- arg_sortnew = np.argsort(y_4_sorted)
- peaks_final =np.sort( first_4_sorted[arg_sortnew][3:] )
- return peaks_final[0]
-
- def return_start_and_end_of_common_text_of_textline_ocr_new(self, textline_image, ind_tot):
- width = np.shape(textline_image)[1]
- height = np.shape(textline_image)[0]
- common_window = int(0.15*width)
-
- width1 = int ( width/2. - common_window )
- width2 = int ( width/2. + common_window )
- mid = int(width/2.)
-
- img_sum = np.sum(textline_image[:,:,0], axis=0)
- sum_smoothed = gaussian_filter1d(img_sum, 3)
-
- peaks_real, _ = find_peaks(sum_smoothed, height=0)
- if len(peaks_real)>70:
- peak_start = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted(
- peaks_real, sum_smoothed, width1, mid+2)
- peak_end = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted(
- peaks_real, sum_smoothed, mid-2, width2)
-
- #plt.figure(ind_tot)
- #plt.imshow(textline_image)
- #plt.plot([peak_start, peak_start], [0, height-1])
- #plt.plot([peak_end, peak_end], [0, height-1])
- #plt.savefig('./'+str(ind_tot)+'.png')
-
- return peak_start, peak_end
- else:
- pass
-
- def return_ocr_of_textline_without_common_section(
- self,
- textline_image,
- model_ocr,
- processor,
- device,
- width_textline,
- h2w_ratio,
- ind_tot,
- ):
-
- if h2w_ratio > 0.05:
- pixel_values = processor(textline_image, return_tensors="pt").pixel_values
- generated_ids = model_ocr.generate(pixel_values.to(device))
- generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
- else:
- #width = np.shape(textline_image)[1]
- #height = np.shape(textline_image)[0]
- #common_window = int(0.3*width)
- #width1 = int ( width/2. - common_window )
- #width2 = int ( width/2.
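The helpers deleted above pick a cut point for over-long text lines from the smoothed column profile: bright columns (gaps between words) show up as peaks, and the strongest peaks near the image centre are taken as candidate split positions. A simplified, self-contained sketch of that heuristic (the function name and the two-peak selection are simplifications of the removed top-four logic):

```python
import numpy as np
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks

def split_points_near_centre(textline_gray: np.ndarray, window: float = 0.2):
    """Find two strong column-profile peaks (likely inter-word gaps) close to
    the middle of a grayscale text line image, so the line can be cut for OCR.
    Returns None when the profile has too few peaks to be reliable."""
    height, width = textline_gray.shape[:2]
    w1 = int(width / 2.0 - window * width)
    w2 = int(width / 2.0 + window * width)

    profile = np.sum(textline_gray, axis=0)        # per-column brightness
    smoothed = gaussian_filter1d(profile, 3)       # suppress noise
    peaks, _ = find_peaks(smoothed, height=0)
    peaks = peaks[(peaks > w1) & (peaks < w2)]     # keep peaks near the centre
    if len(peaks) < 2:
        return None
    strongest = peaks[np.argsort(smoothed[peaks])][::-1][:2]
    return tuple(sorted(strongest))
```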
+ common_window ) - - split_point = return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image) - if split_point: - image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) - - #pixel_values1 = processor(image1, return_tensors="pt").pixel_values - #pixel_values2 = processor(image2, return_tensors="pt").pixel_values - - pixel_values_merged = processor([image1,image2], return_tensors="pt").pixel_values - generated_ids_merged = model_ocr.generate(pixel_values_merged.to(device)) - generated_text_merged = processor.batch_decode(generated_ids_merged, skip_special_tokens=True) - - #print(generated_text_merged,'generated_text_merged') - - #generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - #generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - - #generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - #generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - - #generated_text = generated_text1 + ' ' + generated_text2 - generated_text = generated_text_merged[0] + ' ' + generated_text_merged[1] - - #print(generated_text1,'generated_text1') - #print(generated_text2, 'generated_text2') - #print('########################################') - else: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - #print(generated_text,'generated_text') - #print('########################################') - return generated_text - - def return_ocr_of_textline( - self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - - if h2w_ratio > 0.05: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - else: - #width = np.shape(textline_image)[1] - #height = np.shape(textline_image)[0] - #common_window = int(0.3*width) - #width1 = int ( width/2. - common_window ) - #width2 = int ( width/2. 
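The method removed above follows the usual transformers TrOCR pattern: preprocess with the processor, `generate`, then `batch_decode`; for wide lines both halves are run in a single batch and the decoded texts are joined. A sketch under the assumption of a public TrOCR checkpoint (eynollah loads its own model and processor from the model zoo, so the names below are placeholders):

```python
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# assumed public checkpoint, for illustration only
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def ocr_textline(textline_rgb, split_point=None):
    """textline_rgb: HxWx3 uint8 image of one text line; split_point: optional
    column at which to cut a very wide line before decoding both halves."""
    if split_point is None:
        images = [textline_rgb]
    else:
        images = [textline_rgb[:, :split_point, :], textline_rgb[:, split_point:, :]]
    pixel_values = processor(images, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values.to(device))
    texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return " ".join(texts)
```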
+ common_window ) - - try: - width1, width2 = self.return_start_and_end_of_common_text_of_textline_ocr_new(textline_image, ind_tot) - - image1 = textline_image[:, :width2,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, width1:,:]#image.crop((width1, 0, width, height)) - - pixel_values1 = processor(image1, return_tensors="pt").pixel_values - pixel_values2 = processor(image2, return_tensors="pt").pixel_values - - generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - - generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - #print(generated_text1,'generated_text1') - #print(generated_text2, 'generated_text2') - #print('########################################') - - match = sq(None, generated_text1, generated_text2).find_longest_match( - 0, len(generated_text1), 0, len(generated_text2)) - generated_text = generated_text1 + generated_text2[match.b+match.size:] - except: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - return generated_text + def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes): return list(np.array(ls_cons)[np.array(sorted_indexes)]) @@ -4009,8 +3794,6 @@ class Eynollah: enabled_modes.append("Light textline detection") if self.full_layout: enabled_modes.append("Full layout analysis") - if self.ocr: - enabled_modes.append("OCR") if self.tables: enabled_modes.append("Table detection") if enabled_modes: @@ -4130,21 +3913,12 @@ class Eynollah: id_of_texts_tot =['region_0001'] conf_contours_textregions =[0] - if self.ocr and not self.tr: - gc.collect() - ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons, np.zeros((len(all_found_textline_polygons), 4)), - self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), textline_light=True) - else: - ocr_all_textlines = None - pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, page_coord, [], [], [], [], [], [], [], slopes, [], [], cont_page, [], [], - ocr_all_textlines=ocr_all_textlines, conf_contours_textregion=conf_contours_textregions, skip_layout_reading_order=True) self.logger.info("Basic processing complete") @@ -4629,94 +4403,6 @@ class Eynollah: boxes_d, textline_mask_tot_d) self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") - ocr_all_textlines = None - ocr_all_textlines_marginals_left = None - ocr_all_textlines_marginals_right = None - ocr_all_textlines_h = None - ocr_all_textlines_drop = None - if self.ocr: - self.logger.info("Step 4.5/5: OCR Processing") - - if not self.tr: - gc.collect() - - if len(all_found_textline_polygons): - ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons, all_box_coord, - self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line) - - if len(all_found_textline_polygons_marginals_left): - ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_left, all_box_coord_marginals_left, - self.model_zoo.get("ocr"), self.b_s_ocr, 
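In the removed `return_ocr_of_textline`, the two half-line crops overlap around the cut, so the two transcriptions share some text; difflib's longest common match is used to drop the duplicate before concatenating. A sketch of that stitching step (the helper name and the example strings are mine):

```python
from difflib import SequenceMatcher

def merge_overlapping_transcriptions(left: str, right: str) -> str:
    """Drop the duplicated stretch that both halves of a split text line
    transcribe, then join them, as the removed method did."""
    match = SequenceMatcher(None, left, right).find_longest_match(
        0, len(left), 0, len(right))
    return left + right[match.b + match.size:]

# "the quick brown f" + "brown fox jumps" -> "the quick brown fox jumps"
print(merge_overlapping_transcriptions("the quick brown f", "brown fox jumps"))
```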
self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line) - - if len(all_found_textline_polygons_marginals_right): - ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_marginals_right, all_box_coord_marginals_right, - self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line) - - if self.full_layout and len(all_found_textline_polygons): - ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines( - image_page, all_found_textline_polygons_h, all_box_coord_h, - self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line) - - if self.full_layout and len(polygons_of_drop_capitals): - ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines( - image_page, polygons_of_drop_capitals, np.zeros((len(polygons_of_drop_capitals), 4)), - self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line) - - else: - if self.light_version: - self.logger.info("Using light version OCR") - if self.textline_light: - self.logger.info("Using light text line detection for OCR") - self.logger.info("Processing text lines...") - - gc.collect() - - torch.cuda.empty_cache() - self.model_zoo.get("ocr").to(self.device) - - ind_tot = 0 - #cv2.imwrite('./img_out.png', image_page) - ocr_all_textlines = [] - # FIXME: what about lines in marginals / headings / drop-capitals here? - for indexing, ind_poly_first in enumerate(all_found_textline_polygons): - ocr_textline_in_textregion = [] - for indexing2, ind_poly in enumerate(ind_poly_first): - if not (self.textline_light or self.curved_line): - ind_poly = copy.deepcopy(ind_poly) - box_ind = all_box_coord[indexing] - #print(ind_poly,np.shape(ind_poly), 'ind_poly') - #print(box_ind) - ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) - #print(ind_poly_copy) - ind_poly[ind_poly<0] = 0 - x, y, w, h = cv2.boundingRect(ind_poly) - #print(ind_poly_copy, np.shape(ind_poly_copy)) - #print(x, y, w, h, h/float(w),'ratio') - h2w_ratio = h/float(w) - mask_poly = np.zeros(image_page.shape) - if not self.light_version: - img_poly_on_img = np.copy(image_page) - else: - img_poly_on_img = np.copy(img_bin_light) - mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) - - if self.textline_light: - mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1) - img_poly_on_img[:,:,0][mask_poly[:,:,0] ==0] = 255 - img_poly_on_img[:,:,1][mask_poly[:,:,0] ==0] = 255 - img_poly_on_img[:,:,2][mask_poly[:,:,0] ==0] = 255 - - img_croped = img_poly_on_img[y:y+h, x:x+w, :] - #cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) - text_ocr = self.return_ocr_of_textline_without_common_section( - img_croped, self.model_zoo.get("ocr"), self.model_zoo.get("trocr_processor"), self.device, w, h2w_ratio, ind_tot) - ocr_textline_in_textregion.append(text_ocr) - ind_tot = ind_tot +1 - ocr_all_textlines.append(ocr_textline_in_textregion) - self.logger.info("Step 5/5: Output Generation") if self.full_layout: @@ -4728,9 +4414,7 @@ class Eynollah: all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, - cont_page, polygons_seplines, ocr_all_textlines, ocr_all_textlines_h, - ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, - 
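The deleted TrOCR loop prepared each line image by masking the text line polygon, painting everything outside it white, and cropping to the polygon's bounding box before handing the patch to the OCR model. A sketch of that preparation (the helper name and the 5x5 dilation kernel are assumptions; eynollah uses its own `KERNEL` constant):

```python
import cv2
import numpy as np

def crop_textline(image_page: np.ndarray, polygon: np.ndarray, dilate: bool = False):
    """Mask a text line polygon on the page image, white out the background,
    and return the cropped line plus its height-to-width ratio (used by the
    removed code to decide whether a line must be split for OCR)."""
    polygon = polygon.astype(np.int32)
    polygon[polygon < 0] = 0
    x, y, w, h = cv2.boundingRect(polygon)

    mask = np.zeros(image_page.shape[:2], dtype=np.uint8)
    cv2.fillPoly(mask, pts=[polygon], color=1)
    if dilate:  # the removed code dilated the mask when textline_light was set
        mask = cv2.dilate(mask, np.ones((5, 5), np.uint8), iterations=1)

    line_img = image_page.copy()
    line_img[mask == 0] = 255  # white background outside the polygon
    return line_img[y:y + h, x:x + w], h / float(w)
```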
ocr_all_textlines_drop, + cont_page, polygons_seplines, conf_contours_textregions, conf_contours_textregions_h) else: pcgts = self.writer.build_pagexml_no_full_layout( @@ -4741,9 +4425,6 @@ class Eynollah: all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, contours_tables, - ocr_all_textlines=ocr_all_textlines, - ocr_all_textlines_marginals_left=ocr_all_textlines_marginals_left, - ocr_all_textlines_marginals_right=ocr_all_textlines_marginals_right, conf_contours_textregions=conf_contours_textregions) return pcgts diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index 0f3eda6..61de12c 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -473,8 +473,7 @@ class Eynollah_ocr: img = cv2.imread(dir_img) if dir_in_bin is not None: cropped_lines_bin = [] - dir_img_bin = os.path.join(dir_in_bin, file_name+'.png') - img_bin = cv2.imread(dir_img_bin) + img_bin = cv2.imread(os.path.join(dir_in_bin, file_name+'.png')) if dir_out_image_text: out_image_with_text = os.path.join(dir_out_image_text, file_name+'.png') diff --git a/tests/cli_tests/test_layout.py b/tests/cli_tests/test_layout.py index a34514e..bc354c8 100644 --- a/tests/cli_tests/test_layout.py +++ b/tests/cli_tests/test_layout.py @@ -13,12 +13,6 @@ from ocrd_models.constants import NAMESPACES as NS "--textline_light", "--light_version"], # -ep ... # -eoi ... - # FIXME: find out whether OCR extra was installed, otherwise skip these - ["--do_ocr"], - ["--do_ocr", "--light_version", "--textline_light"], - ["--do_ocr", "--transformer_ocr"], - #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"], - ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"], # --skip_layout_and_reading_order ], ids=str) def test_run_eynollah_layout_filename(
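The `eynollah_ocr.py` hunk above keeps the convention that the binarized companion image shares the RGB file's stem and carries a `.png` suffix. A small sketch of loading such a pair (the explicit `None` check is an addition for illustration, since `cv2.imread` returns `None` instead of raising on missing files):

```python
import os
import cv2

def load_rgb_and_binary(dir_img: str, dir_in_bin: str, file_name: str):
    """dir_img: full path of the RGB image; dir_in_bin: directory holding the
    binarized images; file_name: shared filename stem."""
    img = cv2.imread(dir_img)
    img_bin = cv2.imread(os.path.join(dir_in_bin, file_name + '.png'))
    if img is None or img_bin is None:
        raise FileNotFoundError(f"missing RGB or binarized image for {file_name}")
    return img, img_bin
```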