diff --git a/requirements.txt b/requirements.txt index 9ed0584..4bc0c6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ numpy <1.24.0 scikit-learn >= 0.23.2 tensorflow < 2.13 numba <= 0.58.1 +scikit-image loky +biopython diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c189aca..9dc326d 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -3,6 +3,8 @@ import click from ocrd_utils import initLogging, getLevelName, getLogger from eynollah.eynollah import Eynollah, Eynollah_ocr from eynollah.sbb_binarize import SbbBinarizer +from eynollah.image_enhancer import Enhancer +from eynollah.mb_ro_on_layout import machine_based_reading_order_on_layout @click.group() def main(): @@ -12,38 +14,37 @@ def main(): @click.option( "--dir_xml", "-dx", - help="directory of GT page-xml files", + help="directory of page-xml files", type=click.Path(exists=True, file_okay=False), ) @click.option( - "--dir_out_modal_image", - "-domi", - help="directory where ground truth images would be written", + "--xml_file", + "-xml", + help="xml filename", + type=click.Path(exists=True, dir_okay=False), +) +@click.option( + "--dir_out", + "-do", + help="directory for output images", type=click.Path(exists=True, file_okay=False), ) @click.option( - "--dir_out_classes", - "-docl", - help="directory where ground truth classes would be written", + "--model", + "-m", + help="directory of models", type=click.Path(exists=True, file_okay=False), + required=True, ) -@click.option( - "--input_height", - "-ih", - help="input height", -) -@click.option( - "--input_width", - "-iw", - help="input width", -) -@click.option( - "--min_area_size", - "-min", - help="min area size of regions considered for reading order training.", -) -def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size): - xml_files_ind = os.listdir(dir_xml) + +def machine_based_reading_order(dir_xml, xml_file, dir_out, model): + 
raedingorder_object = machine_based_reading_order_on_layout(model, dir_out=dir_out, logger=getLogger('enhancement')) + + if dir_xml: + raedingorder_object.run(dir_in=dir_xml) + else: + raedingorder_object.run(xml_filename=xml_file) + @main.command() @click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.') @@ -70,6 +71,81 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) +@main.command() +@click.option( + "--image", + "-i", + help="image filename", + type=click.Path(exists=True, dir_okay=False), +) + +@click.option( + "--out", + "-o", + help="directory to write output xml data", + type=click.Path(exists=True, file_okay=False), + required=True, +) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) +@click.option( + "--dir_in", + "-di", + help="directory of images", + type=click.Path(exists=True, file_okay=False), +) +@click.option( + "--model", + "-m", + help="directory of models", + type=click.Path(exists=True, file_okay=False), + required=True, +) + +@click.option( + "--num_col_upper", + "-ncu", + help="lower limit of columns in document image", +) +@click.option( + "--num_col_lower", + "-ncl", + help="upper limit of columns in document image", +) +@click.option( + "--save_org_scale/--no_save_org_scale", + "-sos/-nosos", + is_flag=True, + help="if this parameter set to true, this tool will save the enhanced image in org scale.", +) +@click.option( + "--log_level", + "-l", + type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), + help="Override log level globally to this", +) + +def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_lower, save_org_scale, log_level): + initLogging() + if log_level: + getLogger('enhancement').setLevel(getLevelName(log_level)) + assert image or dir_in, "Either a single image -i or a dir_in -di is required" + enhancer_object 
= Enhancer( + model, + logger=getLogger('enhancement'), + dir_out=out, + num_col_upper=num_col_upper, + num_col_lower=num_col_lower, + save_org_scale=save_org_scale, + ) + if dir_in: + enhancer_object.run(dir_in=dir_in, overwrite=overwrite) + else: + enhancer_object.run(image_filename=image, overwrite=overwrite) @main.command() @click.option( @@ -225,6 +301,17 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) is_flag=True, help="if this parameter set to true, this tool will try to do ocr", ) +@click.option( + "--transformer_ocr", + "-tr/-notr", + is_flag=True, + help="if this parameter set to true, this tool will apply transformer ocr", +) +@click.option( + "--batch_size_ocr", + "-bs_ocr", + help="number of inference batch size of ocr model. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", +) @click.option( "--num_col_upper", "-ncu", @@ -235,6 +322,16 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) "-ncl", help="upper limit of columns in document image", ) +@click.option( + "--threshold_art_class_layout", + "-tharl", + help="threshold of artifical class in the case of layout detection. The default value is 0.1", +) +@click.option( + "--threshold_art_class_textline", + "-thart", + help="threshold of artifical class in the case of textline detection. 
The default value is 0.1", +) @click.option( "--skip_layout_and_reading_order", "-slro/-noslro", @@ -248,7 +345,7 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) help="Override log level globally to this", ) -def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, skip_layout_and_reading_order, ignore_page_extraction, log_level): +def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) @@ -295,9 +392,13 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ ignore_page_extraction=ignore_page_extraction, reading_order_machine_based=reading_order_machine_based, do_ocr=do_ocr, + transformer_ocr=transformer_ocr, + batch_size_ocr=batch_size_ocr, num_col_upper=num_col_upper, num_col_lower=num_col_lower, skip_layout_and_reading_order=skip_layout_and_reading_order, + threshold_art_class_textline=threshold_art_class_textline, + threshold_art_class_layout=threshold_art_class_layout, ) if dir_in: eynollah.run(dir_in=dir_in, overwrite=overwrite) @@ -306,6 +407,18 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ @main.command() +@click.option( + "--image", + 
"-i", + help="image filename", + type=click.Path(exists=True, dir_okay=False), +) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--dir_in", "-di", @@ -342,7 +455,11 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ "-m", help="directory of models", type=click.Path(exists=True, file_okay=False), - required=True, +) +@click.option( + "--model_name", + help="Specific model file path to use for OCR", + type=click.Path(exists=True, file_okay=False), ) @click.option( "--tr_ocr", @@ -362,18 +479,27 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ is_flag=True, help="if this parameter set to true, cropped textline images will not be masked with textline contour.", ) -@click.option( - "--draw_texts_on_image", - "-dtoi/-ndtoi", - is_flag=True, - help="if this parameter set to true, the predicted texts will be displayed on an image.", -) @click.option( "--prediction_with_both_of_rgb_and_bin", "-brb/-nbrb", is_flag=True, help="If this parameter is set to True, the prediction will be performed using both RGB and binary images. However, this does not necessarily improve results; it may be beneficial for certain document images.", ) +@click.option( + "--batch_size", + "-bs", + help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", +) +@click.option( + "--dataset_abbrevation", + "-ds_pref", + help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset", +) +@click.option( + "--min_conf_value_of_textline_text", + "-min_conf", + help="minimum OCR confidence value. 
Text lines with a confidence value lower than this threshold will not be included in the output XML file.", +) @click.option( "--log_level", "-l", @@ -381,24 +507,37 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, log_level): +def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, model_name, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, min_conf_value_of_textline_text, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) + + assert not model or not model_name, "model directory -m can not be set alongside specific model name --model_name" + assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr" + assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m" + assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs" + assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib" + assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit" + assert not export_textline_images_and_text or not prediction_with_both_of_rgb_and_bin, "Exporting textline and text -etit can not be set alongside prediction with both rgb and bin -brb" + assert (bool(image) ^ bool(dir_in)), "Either -i 
(single image) or -di (directory) must be provided, but not both." eynollah_ocr = Eynollah_ocr( + image_filename=image, dir_xmls=dir_xmls, dir_out_image_text=dir_out_image_text, dir_in=dir_in, dir_in_bin=dir_in_bin, dir_out=out, dir_models=model, + model_name=model_name, tr_ocr=tr_ocr, export_textline_images_and_text=export_textline_images_and_text, do_not_mask_with_textline_contour=do_not_mask_with_textline_contour, - draw_texts_on_image=draw_texts_on_image, prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin, + batch_size=batch_size, + pref_of_dataset=dataset_abbrevation, + min_conf_value_of_textline_text=min_conf_value_of_textline_text, ) - eynollah_ocr.run() + eynollah_ocr.run(overwrite=overwrite) if __name__ == "__main__": main() diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index d47016b..ec2900f 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -22,7 +22,6 @@ from multiprocessing import cpu_count import gc import copy import json - from loky import ProcessPoolExecutor import xml.etree.ElementTree as ET import cv2 @@ -30,9 +29,10 @@ import numpy as np from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from numba import cuda - +from skimage.morphology import skeletonize from ocrd import OcrdPage from ocrd_utils import getLogger, tf_disable_interactive_logs +import statistics try: import torch @@ -77,12 +77,26 @@ from .utils.contour import ( from .utils.rotate import ( rotate_image, rotation_not_90_func, - rotation_not_90_func_full_layout + rotation_not_90_func_full_layout, + rotation_image_new +) +from .utils.utils_ocr import ( + return_textline_contour_with_added_box_coordinate, + preprocess_and_resize_image_for_ocrcnn_model, + return_textlines_split_if_needed, + decode_batch_predictions, + return_rnn_cnn_ocr_of_given_textlines, + fit_text_single_line, + break_curved_line_into_small_pieces_and_then_merge, + get_orientation_moments, + rotate_image_with_padding, + 
get_contours_and_bounding_boxes ) from .utils.separate_lines import ( textline_contours_postprocessing, separate_lines_new2, return_deskew_slop, + return_deskew_slop_old_mp, do_work_of_slopes_new, do_work_of_slopes_new_curved, do_work_of_slopes_new_light, @@ -198,8 +212,12 @@ class Eynollah: ignore_page_extraction : bool = False, reading_order_machine_based : bool = False, do_ocr : bool = False, + transformer_ocr: bool = False, + batch_size_ocr: Optional[int] = None, num_col_upper : Optional[int] = None, num_col_lower : Optional[int] = None, + threshold_art_class_layout: Optional[float] = None, + threshold_art_class_textline: Optional[float] = None, skip_layout_and_reading_order : bool = False, logger : Optional[Logger] = None, ): @@ -229,6 +247,7 @@ class Eynollah: self.ignore_page_extraction = ignore_page_extraction self.skip_layout_and_reading_order = skip_layout_and_reading_order self.ocr = do_ocr + self.tr = transformer_ocr if num_col_upper: self.num_col_upper = int(num_col_upper) else: @@ -237,6 +256,17 @@ class Eynollah: self.num_col_lower = int(num_col_lower) else: self.num_col_lower = num_col_lower + + if threshold_art_class_layout: + self.threshold_art_class_layout = float(threshold_art_class_layout) + else: + self.threshold_art_class_layout = 0.1 + + if threshold_art_class_textline: + self.threshold_art_class_textline = float(threshold_art_class_textline) + else: + self.threshold_art_class_textline = 0.1 + self.logger = logger if logger else getLogger('eynollah') # for parallelization of CPU-intensive tasks: self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) @@ -259,7 +289,7 @@ class Eynollah: self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - self.model_reading_order_dir = 
dir_models + "/model_ens_reading_order_machine_based" + self.model_reading_order_dir = dir_models + "/model_eynollah_reading_order_20250824"#"/model_mb_ro_aug_ens_11"#"/model_step_3200000_mb_ro"#"/model_ens_reading_order_machine_based"#"/model_mb_ro_aug_ens_8"#"/model_ens_reading_order_machine_based" #"/modelens_12sp_elay_0_3_4__3_6_n" #"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" @@ -286,8 +316,10 @@ class Eynollah: else: #"/eynollah-textline_20210425" self.model_textline_dir = dir_models + "/modelens_textline_0_1__2_4_16092024" - if self.ocr: + if self.ocr and self.tr: self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" + elif self.ocr and not self.tr: + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250805" if self.tables: if self.light_version: self.model_table_dir = dir_models + "/modelens_table_0t4_201124" @@ -327,11 +359,37 @@ class Eynollah: self.model_region_fl = self.our_load_model(self.model_region_dir_fully) if self.reading_order_machine_based: self.model_reading_order = self.our_load_model(self.model_reading_order_dir) - if self.ocr: + if self.ocr and self.tr: self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #("microsoft/trocr-base-printed")#("microsoft/trocr-base-handwritten") self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") + elif self.ocr and not self.tr: + model_ocr = load_model(self.model_ocr_dir , compile=False) + + self.prediction_model = tf.keras.models.Model( + model_ocr.get_layer(name = "image").input, + model_ocr.get_layer(name = "dense2").output) + if not batch_size_ocr: + self.b_s_ocr = 8 + else: + self.b_s_ocr = int(batch_size_ocr) + + + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + characters = json.load(config_file) + + + AUTOTUNE = tf.data.AUTOTUNE + + # 
Mapping characters to integers. + char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + # Mapping integers back to original characters. + self.num_to_char = StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True + ) + if self.tables: self.model_table = self.our_load_model(self.model_table_dir) @@ -667,6 +725,7 @@ class Eynollah: label_p_pred = self.model_classifier.predict(img_in, verbose=0) num_col = np.argmax(label_p_pred[0]) + 1 + elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): if self.input_binary: img_in = np.copy(img) @@ -784,7 +843,7 @@ class Eynollah: self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False): + thresholding_for_artificial_class_in_light_version=False, thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): self.logger.debug("enter do_prediction") img_height_model = model.layers[-1].output_shape[1] @@ -802,10 +861,22 @@ class Eynollah: if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[0,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 - seg[seg_art==1]=2 + seg[skeleton_art==1]=2 + + if thresholding_for_fl_light_version: + seg_header = label_p_pred[0,:,:,2] + + seg_header[seg_header<0.2] = 0 + seg_header[seg_header>0] =1 + + seg[seg_header==1]=2 + seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) return prediction_true @@ -896,14 +967,17 @@ class Eynollah: if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 - seg[seg_art==1]=2 + ##seg[seg_art==1]=2 indexer_inside_batch = 0 for i_batch, j_batch in zip(list_i_s, 
list_j_s): seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] index_y_u_in = list_y_u[indexer_inside_batch] index_y_d_in = list_y_d[indexer_inside_batch] @@ -917,54 +991,107 @@ class Eynollah: seg_in[0:-margin or None, 0:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[margin:, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + elif i_batch == 0 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + 0:index_x_u_in - margin] = \ seg_in[margin:, 0:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[0:-margin or None, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + 0:index_x_u_in - margin] = \ seg_in[margin:-margin or None, 0:-margin or None, np.newaxis] + if 
thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[margin:-margin or None, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[0:-margin or None, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[margin:, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + margin:-margin or None] + else: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[margin:-margin or None, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + margin:-margin or None] 
indexer_inside_batch += 1 @@ -979,6 +1106,19 @@ class Eynollah: img_patch[:] = 0 prediction_true = prediction_true.astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=2 #del model gc.collect() return prediction_true @@ -1117,7 +1257,7 @@ class Eynollah: self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False): + thresholding_for_artificial_class_in_light_version=False, threshold_art_class_textline=0.1, threshold_art_class_layout=0.1): self.logger.debug("enter do_prediction_new_concept") img_height_model = model.layers[-1].output_shape[1] @@ -1132,19 +1272,28 @@ class Eynollah: label_p_pred = model.predict(img[np.newaxis], verbose=0) seg = np.argmax(label_p_pred, axis=3)[0] - if thresholding_for_artificial_class_in_light_version: - #seg_text = label_p_pred[0,:,:,1] - #seg_text[seg_text<0.2] =0 - #seg_text[seg_text>0] =1 - #seg[seg_text==1]=1 - - seg_art = label_p_pred[0,:,:,4] - seg_art[seg_art<0.2] =0 - seg_art[seg_art>0] =1 - seg[seg_art==1]=4 - seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + seg_art = label_p_pred[0,:,:,4] + seg_art[seg_art0] =1 + #seg[seg_art==1]=4 + seg_art = resize_image(seg_art, img_h_page, img_w_page).astype(np.uint8) + + prediction_true[:,:,0][prediction_true[:,:,0]==4] = 0 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 + + 
skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1] = 4 + return prediction_true , resize_image(label_p_pred[0, :, :, 1] , img_h_page, img_w_page) if img.shape[0] < img_height_model: @@ -1217,26 +1366,29 @@ class Eynollah: if thresholding_for_some_classes_in_light_version: seg_art = label_p_pred[:,:,:,4] - seg_art[seg_art<0.2] =0 + seg_art[seg_art0] =1 seg_line = label_p_pred[:,:,:,3] - seg_line[seg_line>0.1] =1 + seg_line[seg_line>0.4] =1#seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 seg_line[seg_line<1] =0 - seg[seg_art==1]=4 - seg[(seg_line==1) & (seg==0)]=3 + ##seg[seg_art==1]=4 + #seg[(seg_line==1) & (seg==0)]=3 if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 - seg[seg_art==1]=2 + ##seg[seg_art==1]=2 indexer_inside_batch = 0 for i_batch, j_batch in zip(list_i_s, list_j_s): seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] index_y_u_in = list_y_u[indexer_inside_batch] index_y_d_in = list_y_d[indexer_inside_batch] @@ -1255,6 +1407,12 @@ class Eynollah: label_p_pred[0, 0:-margin or None, 0:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1266,6 +1424,12 @@ class Eynollah: label_p_pred[0, margin:, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 
margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + elif i_batch == 0 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + 0:index_x_u_in - margin] = \ @@ -1277,6 +1441,13 @@ class Eynollah: label_p_pred[0, margin:, 0:-margin or None, 1] + + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1288,6 +1459,12 @@ class Eynollah: label_p_pred[0, 0:-margin or None, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + 0:index_x_u_in - margin] = \ @@ -1299,6 +1476,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, 0:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1310,6 +1492,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + 
prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1321,6 +1508,11 @@ class Eynollah: label_p_pred[0, 0:-margin or None, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1332,6 +1524,11 @@ class Eynollah: label_p_pred[0, margin:, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + margin:-margin or None] else: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1343,6 +1540,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + margin:-margin or None] indexer_inside_batch += 1 list_i_s = [] @@ -1356,6 +1558,32 @@ class Eynollah: img_patch[:] = 0 prediction_true = prediction_true.astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + 
prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=2 + + if thresholding_for_some_classes_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==4] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=4 gc.collect() return prediction_true, confidence_matrix @@ -1440,10 +1668,11 @@ class Eynollah: model_region = self.model_region_fl if patches else self.model_region_fl_np if self.light_version: - pass + thresholding_for_fl_light_version = True elif not patches: img = otsu_copy_binary(img).astype(np.uint8) prediction_regions = None + thresholding_for_fl_light_version = False elif cols: img = otsu_copy_binary(img).astype(np.uint8) if cols == 1: @@ -1459,7 +1688,7 @@ class Eynollah: else: img = resize_image(img, int(img_height_h * 2500 / float(img_width_h)), 2500).astype(np.uint8) - prediction_regions = self.do_prediction(patches, img, model_region, marginal_of_patch_percent=0.1, n_batch_inference=3) + prediction_regions = self.do_prediction(patches, img, model_region, marginal_of_patch_percent=0.1, n_batch_inference=3, thresholding_for_fl_light_version=thresholding_for_fl_light_version) prediction_regions = resize_image(prediction_regions, img_height_h, img_width_h) self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions @@ -1608,7 +1837,7 @@ class Eynollah: prediction_textline = self.do_prediction( use_patches, img, self.model_textline, marginal_of_patch_percent=0.15, n_batch_inference=3, - 
thresholding_for_artificial_class_in_light_version=self.textline_light) + thresholding_for_artificial_class_in_light_version=self.textline_light, threshold_art_class_textline=self.threshold_art_class_textline) #if not self.textline_light: #if num_col_classifier==1: #prediction_textline_nopatch = self.do_prediction(False, img, self.model_textline) @@ -1622,7 +1851,55 @@ class Eynollah: textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') #textline_mask_tot_ea_art = cv2.dilate(textline_mask_tot_ea_art, KERNEL, iterations=1) prediction_textline[:,:][textline_mask_tot_ea_art[:,:]==1]=2 + """ + else: + textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') + hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (8, 1)) + + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) + ##cv2.imwrite('textline_mask_tot_ea_art.png', textline_mask_tot_ea_art) + textline_mask_tot_ea_art = cv2.dilate(textline_mask_tot_ea_art, hor_kernel, iterations=1) + + ###cv2.imwrite('dil_textline_mask_tot_ea_art.png', dil_textline_mask_tot_ea_art) + + textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') + + #print(np.shape(dil_textline_mask_tot_ea_art), np.unique(dil_textline_mask_tot_ea_art), 'dil_textline_mask_tot_ea_art') + tsk = time.time() + skeleton_art_textline = skeletonize(textline_mask_tot_ea_art[:,:,0]) + + skeleton_art_textline = skeleton_art_textline*1 + + skeleton_art_textline = skeleton_art_textline.astype('uint8') + + skeleton_art_textline = cv2.dilate(skeleton_art_textline, kernel, iterations=1) + + #print(np.unique(skeleton_art_textline), np.shape(skeleton_art_textline)) + + #print(skeleton_art_textline, np.unique(skeleton_art_textline)) + + #cv2.imwrite('skeleton_art_textline.png', skeleton_art_textline) + + prediction_textline[:,:,0][skeleton_art_textline[:,:]==1]=2 + + #cv2.imwrite('prediction_textline1.png', prediction_textline[:,:,0]) + + ##hor_kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (4, 1)) + ##ver_kernel2 = 
cv2.getStructuringElement(cv2.MORPH_RECT, (1, 3)) + ##textline_mask_tot_ea_main = (prediction_textline[:,:]==1)*1 + ##textline_mask_tot_ea_main = textline_mask_tot_ea_main.astype('uint8') + + ##dil_textline_mask_tot_ea_main = cv2.erode(textline_mask_tot_ea_main, ver_kernel2, iterations=1) + + ##dil_textline_mask_tot_ea_main = cv2.dilate(textline_mask_tot_ea_main, hor_kernel2, iterations=1) + + ##dil_textline_mask_tot_ea_main = cv2.dilate(textline_mask_tot_ea_main, ver_kernel2, iterations=1) + + ##prediction_textline[:,:][dil_textline_mask_tot_ea_main[:,:]==1]=1 + + """ + textline_mask_tot_ea_lines = (prediction_textline[:,:]==1)*1 textline_mask_tot_ea_lines = textline_mask_tot_ea_lines.astype('uint8') if not self.textline_light: @@ -1631,10 +1908,15 @@ class Eynollah: prediction_textline[:,:][textline_mask_tot_ea_lines[:,:]==1]=1 if not self.textline_light: prediction_textline[:,:][old_art[:,:]==1]=2 + + #cv2.imwrite('prediction_textline2.png', prediction_textline[:,:,0]) prediction_textline_longshot = self.do_prediction(False, img, self.model_textline) prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) - + + + #cv2.imwrite('prediction_textline.png', prediction_textline[:,:,0]) + #sys.exit() self.logger.debug('exit textline_contours') return ((prediction_textline[:, :, 0]==1).astype(np.uint8), (prediction_textline_longshot_true_size[:, :, 0]==1).astype(np.uint8)) @@ -1656,8 +1938,8 @@ class Eynollah: y_diff_mean = find_contours_mean_y_diff(textline_con_fil) sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) crop_img[crop_img > 0] = 1 - slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, - map=self.executor.map, logger=self.logger, plotter=self.plotter) + slope_corresponding_textregion = return_deskew_slop_old_mp(crop_img, sigma_des, + logger=self.logger, plotter=self.plotter) except Exception as why: self.logger.error(why) slope_corresponding_textregion = MAX_SLOPE @@ -1823,7 +2105,7 @@ class 
Eynollah: ###img_bin = np.copy(prediction_bin) ###else: ###img_bin = np.copy(img_resized) - if self.ocr and not self.input_binary: + if (self.ocr and self.tr) and not self.input_binary: prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5) prediction_bin = 255 * (prediction_bin[:,:,0] == 0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) @@ -1840,7 +2122,7 @@ class Eynollah: textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_height_h, img_width_h ) #print(self.image_org.shape) - #cv2.imwrite('out_13.png', self.image_page_org_size) + #cv2.imwrite('textline.png', textline_mask_tot_ea) #plt.imshwo(self.image_page_org_size) #plt.show() @@ -1852,13 +2134,13 @@ class Eynollah: img_resized.shape[1], img_resized.shape[0], num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( True, img_resized, self.model_region_1_2, n_batch_inference=1, - thresholding_for_some_classes_in_light_version=True) + thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) else: prediction_regions_org = np.zeros((self.image_org.shape[0], self.image_org.shape[1], 3)) confidence_matrix = np.zeros((self.image_org.shape[0], self.image_org.shape[1])) prediction_regions_page, confidence_matrix_page = self.do_prediction_new_concept( False, self.image_page_org_size, self.model_region_1_2, n_batch_inference=1, - thresholding_for_artificial_class_in_light_version=True) + thresholding_for_artificial_class_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) ys = slice(*self.page_coord[0:2]) xs = slice(*self.page_coord[2:4]) prediction_regions_org[ys, xs] = prediction_regions_page @@ -1871,7 +2153,7 @@ class Eynollah: img_resized.shape[1], img_resized.shape[0], new_h, num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( True, img_resized, self.model_region_1_2, 
n_batch_inference=2, - thresholding_for_some_classes_in_light_version=True) + thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) ###prediction_regions_org = self.do_prediction(True, img_bin, self.model_region, n_batch_inference=3, thresholding_for_some_classes_in_light_version=True) #print("inside 3 ", time.time()-t_in) #plt.imshow(prediction_regions_org[:,:,0]) @@ -2809,6 +3091,26 @@ class Eynollah: num_col = num_col + 1 if not num_column_is_classified: num_col_classifier = num_col + 1 + if self.num_col_upper and self.num_col_lower: + if self.num_col_upper == self.num_col_lower: + num_col_classifier = self.num_col_upper + else: + if num_col_classifier < self.num_col_lower: + num_col_classifier = self.num_col_lower + if num_col_classifier > self.num_col_upper: + num_col_classifier = self.num_col_upper + + elif self.num_col_lower and not self.num_col_upper: + if num_col_classifier < self.num_col_lower: + num_col_classifier = self.num_col_lower + + elif self.num_col_upper and not self.num_col_lower: + if num_col_classifier > self.num_col_upper: + num_col_classifier = self.num_col_upper + + else: + pass + except Exception as why: self.logger.error(why) num_col = None @@ -2923,8 +3225,8 @@ class Eynollah: def run_deskew(self, textline_mask_tot_ea): #print(textline_mask_tot_ea.shape, 'textline_mask_tot_ea deskew') - slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, - map=self.executor.map, logger=self.logger, plotter=self.plotter) + slope_deskew = return_deskew_slop_old_mp(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, + logger=self.logger, plotter=self.plotter) slope_first = 0 if self.plotter: @@ -2942,7 +3244,6 @@ class Eynollah: text_regions_p_1[mask_lines[:, :] == 1] = 3 text_regions_p = text_regions_p_1[:, :] text_regions_p = np.array(text_regions_p) - if num_col_classifier in (1, 2): try: regions_without_separators = 
(text_regions_p[:, :] == 1) * 1 @@ -3248,8 +3549,10 @@ class Eynollah: # 6 is the separators lable in old full layout model # 4 is the drop capital class in old full layout model # in the new full layout drop capital is 3 and separators are 5 - - text_regions_p[:,:][regions_fully[:,:,0]==5]=6 + + # the separators in full layout will not be written on layout + if not self.reading_order_machine_based: + text_regions_p[:,:][regions_fully[:,:,0]==5]=6 ###regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 3] = 4 #text_regions_p[:,:][regions_fully[:,:,0]==6]=6 @@ -3318,9 +3621,100 @@ class Eynollah: return model def do_order_of_regions_with_model(self, contours_only_text_parent, contours_only_text_parent_h, text_regions_p): + + height1 =672#448 + width1 = 448#224 + + height2 =672#448 + width2= 448#224 + + height3 =672#448 + width3 = 448#224 + + inference_bs = 3 + + ver_kernel = np.ones((5, 1), dtype=np.uint8) + hor_kernel = np.ones((1, 5), dtype=np.uint8) + + + min_cont_size_to_be_dilated = 10 + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + cx_conts, cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) + args_cont_located = np.array(range(len(contours_only_text_parent))) + + diff_y_conts = np.abs(y_max_conts[:]-y_min_conts) + diff_x_conts = np.abs(x_max_conts[:]-x_min_conts) + + mean_x = statistics.mean(diff_x_conts) + median_x = statistics.median(diff_x_conts) + + + diff_x_ratio= diff_x_conts/mean_x + + args_cont_located_excluded = args_cont_located[diff_x_ratio>=1.3] + args_cont_located_included = args_cont_located[diff_x_ratio<1.3] + + contours_only_text_parent_excluded = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]>=1.3]#contours_only_text_parent[diff_x_ratio>=1.3] + contours_only_text_parent_included = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if 
diff_x_ratio[ind]<1.3]#contours_only_text_parent[diff_x_ratio<1.3] + + + cx_conts_excluded = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]>=1.3]#cx_conts[diff_x_ratio>=1.3] + cx_conts_included = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]<1.3]#cx_conts[diff_x_ratio<1.3] + + cy_conts_excluded = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]>=1.3]#cy_conts[diff_x_ratio>=1.3] + cy_conts_included = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]<1.3]#cy_conts[diff_x_ratio<1.3] + + #print(diff_x_ratio, 'ratio') + text_regions_p = text_regions_p.astype('uint8') + + if len(contours_only_text_parent_excluded)>0: + textregion_par = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1])).astype('uint8') + textregion_par = cv2.fillPoly(textregion_par, pts=contours_only_text_parent_included, color=(1,1)) + else: + textregion_par = (text_regions_p[:,:]==1)*1 + textregion_par = textregion_par.astype('uint8') + + text_regions_p_textregions_dilated = cv2.erode(textregion_par , hor_kernel, iterations=2) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=4) + text_regions_p_textregions_dilated = cv2.erode(text_regions_p_textregions_dilated , hor_kernel, iterations=1) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=5) + text_regions_p_textregions_dilated[text_regions_p[:,:]>1] = 0 + + + contours_only_dilated, hir_on_text_dilated = return_contours_of_image(text_regions_p_textregions_dilated) + contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) + + indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located = self.return_indexes_of_contours_loctaed_inside_another_list_of_contours(contours_only_dilated, contours_only_text_parent_included, cx_conts_included, cy_conts_included, args_cont_located_included) + + + 
if len(args_cont_located_excluded)>0: + for ind in args_cont_located_excluded: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + #print(len( np.unique(flattened_array)), 'indexes_of_located_cont uniques') + + missing_textregions = list( set(np.array(range(len(contours_only_text_parent))) ) - set(np.unique(flattened_array)) ) + #print(missing_textregions, 'missing_textregions') + + for ind in missing_textregions: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + + if contours_only_text_parent_h: + for vi in range(len(contours_only_text_parent_h)): + indexes_of_located_cont.append(int(vi+len(contours_only_text_parent))) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + y_len = text_regions_p.shape[0] x_len = text_regions_p.shape[1] - img_poly = np.zeros((y_len,x_len), dtype='uint8') img_poly[text_regions_p[:,:]==1] = 1 @@ -3328,25 +3722,24 @@ class Eynollah: img_poly[text_regions_p[:,:]==3] = 4 img_poly[text_regions_p[:,:]==6] = 5 - - #temp - sep_mask = (img_poly==5)*1 - sep_mask = sep_mask.astype('uint8') - sep_mask = cv2.erode(sep_mask, kernel=KERNEL, iterations=2) - img_poly[img_poly==5] = 0 - img_poly[sep_mask==1] = 5 - # - img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') if contours_only_text_parent_h: _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours( contours_only_text_parent_h) for j in range(len(cy_main)): img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, - 
int(x_min_main[j]):int(x_max_main[j])] = 1 - co_text_all = contours_only_text_parent + contours_only_text_parent_h + int(x_min_main[j]):int(x_max_main[j])] = 1 + co_text_all_org = contours_only_text_parent + contours_only_text_parent_h + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + contours_only_text_parent_h + else: + co_text_all = contours_only_text_parent + contours_only_text_parent_h else: - co_text_all = contours_only_text_parent + co_text_all_org = contours_only_text_parent + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + else: + co_text_all = contours_only_text_parent if not len(co_text_all): return [], [] @@ -3361,21 +3754,13 @@ class Eynollah: cv2.fillPoly(img, pts=[co_text_all[i]], color=(1,)) labels_con[:,:,i] = img - height1 =672#448 - width1 = 448#224 - - height2 =672#448 - width2= 448#224 - - height3 =672#448 - width3 = 448#224 labels_con = resize_image(labels_con.astype(np.uint8), height1, width1).astype(bool) img_header_and_sep = resize_image(img_header_and_sep, height1, width1) img_poly = resize_image(img_poly, height3, width3) - inference_bs = 3 + input_1 = np.zeros((inference_bs, height1, width1, 3)) ordered = [list(range(len(co_text_all)))] index_update = 0 @@ -3425,221 +3810,25 @@ class Eynollah: break ordered = [i[0] for i in ordered] - region_ids = ['region_%04d' % i for i in range(len(co_text_all))] - return ordered, region_ids - - def return_start_and_end_of_common_text_of_textline_ocr(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.2*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. 
+ common_window ) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - print(len(peaks_real), 'len(peaks_real)') - - peaks_real = peaks_real[(peaks_realwidth1)] - - arg_sort = np.argsort(sum_smoothed[peaks_real]) - arg_sort4 =arg_sort[::-1][:4] - peaks_sort_4 = peaks_real[arg_sort][::-1][:4] - argsort_sorted = np.argsort(peaks_sort_4) - - first_4_sorted = peaks_sort_4[argsort_sorted] - y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] - #print(first_4_sorted,'first_4_sorted') - - arg_sortnew = np.argsort(y_4_sorted) - peaks_final =np.sort( first_4_sorted[arg_sortnew][2:] ) - - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peaks_final[0], peaks_final[0]], [0, height-1]) - #plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') - - return peaks_final[0], peaks_final[1] + + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + org_contours_indexes = [] + for ind in range(len(ordered)): + region_with_curr_order = ordered[ind] + if region_with_curr_order < len(contours_only_dilated): + if np.isscalar(indexes_of_located_cont[region_with_curr_order]): + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + else: + arg_sort_located_cont = np.argsort(center_y_coordinates_of_located[region_with_curr_order]) + org_contours_indexes = org_contours_indexes + list(np.array(indexes_of_located_cont[region_with_curr_order])[arg_sort_located_cont]) ##org_contours_indexes + list ( + else: + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return org_contours_indexes, region_ids else: - pass - - def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, 
textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.06*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - #print(len(peaks_real), 'len(peaks_real)') - - peaks_real = peaks_real[(peaks_realwidth1)] - - arg_max = np.argmax(sum_smoothed[peaks_real]) - peaks_final = peaks_real[arg_max] - - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peaks_final, peaks_final], [0, height-1]) - ##plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') - - return peaks_final - else: - return None - - def return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - self, peaks_real, sum_smoothed, start_split, end_split): - - peaks_real = peaks_real[(peaks_realstart_split)] - - arg_sort = np.argsort(sum_smoothed[peaks_real]) - arg_sort4 =arg_sort[::-1][:4] - peaks_sort_4 = peaks_real[arg_sort][::-1][:4] - argsort_sorted = np.argsort(peaks_sort_4) - - first_4_sorted = peaks_sort_4[argsort_sorted] - y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] - #print(first_4_sorted,'first_4_sorted') - - arg_sortnew = np.argsort(y_4_sorted) - peaks_final =np.sort( first_4_sorted[arg_sortnew][3:] ) - return peaks_final[0] - - def return_start_and_end_of_common_text_of_textline_ocr_new(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.15*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - mid = int(width/2.) 
- - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - peak_start = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - peaks_real, sum_smoothed, width1, mid+2) - peak_end = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - peaks_real, sum_smoothed, mid-2, width2) - - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peak_start, peak_start], [0, height-1]) - #plt.plot([peak_end, peak_end], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') - - return peak_start, peak_end - else: - pass - - def return_ocr_of_textline_without_common_section( - self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - - if h2w_ratio > 0.05: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - else: - #width = np.shape(textline_image)[1] - #height = np.shape(textline_image)[0] - #common_window = int(0.3*width) - #width1 = int ( width/2. - common_window ) - #width2 = int ( width/2. 
+ common_window ) - - split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section( - textline_image, ind_tot) - if split_point: - image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) - - #pixel_values1 = processor(image1, return_tensors="pt").pixel_values - #pixel_values2 = processor(image2, return_tensors="pt").pixel_values - - pixel_values_merged = processor([image1,image2], return_tensors="pt").pixel_values - generated_ids_merged = model_ocr.generate(pixel_values_merged.to(device)) - generated_text_merged = processor.batch_decode(generated_ids_merged, skip_special_tokens=True) - - #print(generated_text_merged,'generated_text_merged') - - #generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - #generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - - #generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - #generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - - #generated_text = generated_text1 + ' ' + generated_text2 - generated_text = generated_text_merged[0] + ' ' + generated_text_merged[1] - - #print(generated_text1,'generated_text1') - #print(generated_text2, 'generated_text2') - #print('########################################') - else: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - #print(generated_text,'generated_text') - #print('########################################') - return generated_text - - def return_ocr_of_textline( - self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - - if h2w_ratio > 0.05: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = 
model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - else: - #width = np.shape(textline_image)[1] - #height = np.shape(textline_image)[0] - #common_window = int(0.3*width) - #width1 = int ( width/2. - common_window ) - #width2 = int ( width/2. + common_window ) - - try: - width1, width2 = self.return_start_and_end_of_common_text_of_textline_ocr_new(textline_image, ind_tot) - - image1 = textline_image[:, :width2,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, width1:,:]#image.crop((width1, 0, width, height)) - - pixel_values1 = processor(image1, return_tensors="pt").pixel_values - pixel_values2 = processor(image2, return_tensors="pt").pixel_values - - generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - - generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - #print(generated_text1,'generated_text1') - #print(generated_text2, 'generated_text2') - #print('########################################') - - match = sq(None, generated_text1, generated_text2).find_longest_match( - 0, len(generated_text1), 0, len(generated_text2)) - generated_text = generated_text1 + generated_text2[match.b+match.size:] - except: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - return generated_text - - def return_textline_contour_with_added_box_coordinate(self, textline_contour, box_ind): - textline_contour[:,0] = textline_contour[:,0] + box_ind[2] - textline_contour[:,1] = textline_contour[:,1] + box_ind[0] - return textline_contour + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return ordered, 
region_ids def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes): return [ls_cons[sorted_indexes[index]] for index in range(len(sorted_indexes))] @@ -3811,7 +4000,7 @@ class Eynollah: if dilation_m1<6: dilation_m1 = 6 #print(dilation_m1, 'dilation_m1') - dilation_m1 = 6 + dilation_m1 = 4#6 dilation_m2 = int(dilation_m1/2.) +1 for i in range(len(x_differential)): @@ -4068,6 +4257,29 @@ class Eynollah: contours[ind_u_a_trs].pop(ittrd) return contours + + def return_indexes_of_contours_loctaed_inside_another_list_of_contours(self, contours, contours_loc, cx_main_loc, cy_main_loc, indexes_loc): + indexes_of_located_cont = [] + center_x_coordinates_of_located = [] + center_y_coordinates_of_located = [] + #M_main_tot = [cv2.moments(contours_loc[j]) + #for j in range(len(contours_loc))] + #cx_main_loc = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + #cy_main_loc = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + + for ij in range(len(contours)): + results = [cv2.pointPolygonTest(contours[ij], (cx_main_loc[ind], cy_main_loc[ind]), False) + for ind in range(len(cy_main_loc)) ] + results = np.array(results) + indexes_in = np.where((results == 0) | (results == 1)) + indexes = indexes_loc[indexes_in]# [(results == 0) | (results == 1)]#np.where((results == 0) | (results == 1)) + + indexes_of_located_cont.append(indexes) + center_x_coordinates_of_located.append(np.array(cx_main_loc)[indexes_in] ) + center_y_coordinates_of_located.append(np.array(cy_main_loc)[indexes_in] ) + + return indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located + def filter_contours_without_textline_inside( self, contours,text_con_org, contours_textline, contours_only_text_parent_d_ordered, conf_contours_textregions): @@ -4255,6 +4467,43 @@ class Eynollah: return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, 
contours_only_text_parent_rem, index_by_text_par_con_rem_sort) + + def separate_marginals_to_left_and_right_and_order_from_top_to_down(self, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width): + cx_marg, cy_marg, _, _, _, _, _ = find_new_features_of_contours( + polygons_of_marginals) + + cx_marg = np.array(cx_marg) + cy_marg = np.array(cy_marg) + + poly_marg_left = list( np.array(polygons_of_marginals)[cx_marg < mid_point_of_page_width] ) + poly_marg_right = list( np.array(polygons_of_marginals)[cx_marg >= mid_point_of_page_width] ) + + all_found_textline_polygons_marginals_left = list( np.array(all_found_textline_polygons_marginals)[cx_marg < mid_point_of_page_width] ) + all_found_textline_polygons_marginals_right = list( np.array(all_found_textline_polygons_marginals)[cx_marg >= mid_point_of_page_width] ) + + all_box_coord_marginals_left = list( np.array(all_box_coord_marginals)[cx_marg < mid_point_of_page_width] ) + all_box_coord_marginals_right = list( np.array(all_box_coord_marginals)[cx_marg >= mid_point_of_page_width] ) + + slopes_marg_left = list( np.array(slopes_marginals)[cx_marg < mid_point_of_page_width] ) + slopes_marg_right = list( np.array(slopes_marginals)[cx_marg >= mid_point_of_page_width] ) + + cy_marg_left = cy_marg[cx_marg < mid_point_of_page_width] + cy_marg_right = cy_marg[cx_marg >= mid_point_of_page_width] + + ordered_left_marginals = [poly for _, poly in sorted(zip(cy_marg_left, poly_marg_left), key=lambda x: x[0])] + ordered_right_marginals = [poly for _, poly in sorted(zip(cy_marg_right, poly_marg_right), key=lambda x: x[0])] + + ordered_left_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_left, all_found_textline_polygons_marginals_left), key=lambda x: x[0])] + ordered_right_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_right, all_found_textline_polygons_marginals_right), key=lambda x: x[0])] + + ordered_left_marginals_bbox = [poly 
for _, poly in sorted(zip(cy_marg_left, all_box_coord_marginals_left), key=lambda x: x[0])] + ordered_right_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_right, all_box_coord_marginals_right), key=lambda x: x[0])] + + ordered_left_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_left, slopes_marg_left), key=lambda x: x[0])] + ordered_right_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_right, slopes_marg_right), key=lambda x: x[0])] + + return ordered_left_marginals, ordered_right_marginals, ordered_left_marginals_textline, ordered_right_marginals_textline, ordered_left_marginals_bbox, ordered_right_marginals_bbox, ordered_left_slopes_marginals, ordered_right_slopes_marginals + def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False): """ @@ -4271,6 +4520,7 @@ class Eynollah: raise ValueError("run requires either a single image filename or a directory") for img_filename in self.ls_imgs: + print(img_filename, 'img_filename') self.logger.info(img_filename) t0 = time.time() @@ -4296,14 +4546,14 @@ class Eynollah: t0 = time.time() img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) self.logger.info("Enhancing took %.1fs ", time.time() - t0) + if self.extract_only_images: text_regions_p_1, erosion_hurts, polygons_lines_xml, polygons_of_images, image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) - ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], - polygons_of_images, [], [], [], [], [], - cont_page, [], [], ocr_all_textlines, []) + polygons_of_images, [], [], [], [], [], [], [], [], [], + cont_page, [], []) if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) return pcgts @@ -4316,12 +4566,13 @@ class Eynollah: page_coord, image_page, 
textline_mask_tot_ea, img_bin_light, cont_page = \ self.run_graphics_and_columns_without_layout(textline_mask_tot_ea, img_bin_light) - ##all_found_textline_polygons =self.scale_contours_new(textline_mask_tot_ea) cnt_clean_rot_raw, hir_on_cnt_clean_rot = return_contours_of_image(textline_mask_tot_ea) all_found_textline_polygons = filter_contours_area_of_image( textline_mask_tot_ea, cnt_clean_rot_raw, hir_on_cnt_clean_rot, max_area=1, min_area=0.00001) + + all_found_textline_polygons = all_found_textline_polygons[::-1] all_found_textline_polygons=[ all_found_textline_polygons ] @@ -4329,26 +4580,36 @@ class Eynollah: all_found_textline_polygons) all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( all_found_textline_polygons, None, textline_mask_tot_ea, type_contour="textline") - - + + order_text_new = [0] slopes =[0] id_of_texts_tot =['region_0001'] polygons_of_images = [] - slopes_marginals = [] - polygons_of_marginals = [] - all_found_textline_polygons_marginals = [] - all_box_coord_marginals = [] + slopes_marginals_left = [] + slopes_marginals_right = [] + polygons_of_marginals_left = [] + polygons_of_marginals_right = [] + all_found_textline_polygons_marginals_left = [] + all_found_textline_polygons_marginals_right = [] + all_box_coord_marginals_left = [] + all_box_coord_marginals_right = [] polygons_lines_xml = [] contours_tables = [] - ocr_all_textlines = None - conf_contours_textregions =None + conf_contours_textregions =[0] + + if self.ocr and not self.tr: + gc.collect() + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, textline_light=True) + else: + ocr_all_textlines = None + pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, 
slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions) + all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines=ocr_all_textlines, conf_contours_textregion=conf_contours_textregions, skip_layout_reading_order=self.skip_layout_and_reading_order) return pcgts #print("text region early -1 in %.1fs", time.time() - t0) @@ -4395,13 +4656,11 @@ class Eynollah: #self.logger.info('cont_page %s', cont_page) #plt.imshow(table_prediction) #plt.show() - if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") - ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], [], [], [], [], [], [], - cont_page, [], [], ocr_all_textlines, []) + [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], [], + cont_page, [], []) return pcgts #print("text region early in %.1fs", time.time() - t0) @@ -4572,6 +4831,7 @@ class Eynollah: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] #contours_only_text_parent = [] + if not len(contours_only_text_parent): # stop early empty_marginals = [[]] * len(polygons_of_marginals) @@ -4579,14 +4839,14 @@ class Eynollah: pcgts = self.writer.build_pagexml_full_layout( [], [], page_coord, [], [], [], [], [], [], polygons_of_images, contours_tables, [], - polygons_of_marginals, empty_marginals, empty_marginals, [], [], [], - cont_page, polygons_lines_xml, [], [], []) + polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, empty_marginals, empty_marginals, [], [], [], [], + cont_page, polygons_lines_xml) else: pcgts = 
self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], polygons_of_images, - polygons_of_marginals, empty_marginals, empty_marginals, [], [], - cont_page, polygons_lines_xml, contours_tables, [], []) + polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, empty_marginals, empty_marginals, [], [], [], + cont_page, polygons_lines_xml, contours_tables) return pcgts @@ -4678,8 +4938,11 @@ class Eynollah: num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2( all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - - #print("text region early 6 in %.1fs", time.time() - t0) + + mid_point_of_page_width = text_regions_p.shape[1] / 2. + polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes_marginals_left, slopes_marginals_right = self.separate_marginals_to_left_and_right_and_order_from_top_to_down(polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width) + + #print(len(polygons_of_marginals), len(ordered_left_marginals), len(ordered_right_marginals), 'marginals ordred') if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( @@ -4710,10 +4973,10 @@ class Eynollah: pixel_img = 4 polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( - text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, - all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, - kernel=KERNEL, curved_line=self.curved_line, 
textline_light=self.textline_light) + ##all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( + ##text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, + ##all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, + ##kernel=KERNEL, curved_line=self.curved_line, textline_light=self.textline_light) if not self.reading_order_machine_based: pixel_seps = 6 @@ -4759,6 +5022,7 @@ class Eynollah: if self.full_layout: if self.reading_order_machine_based: + tror = time.time() order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: @@ -4770,16 +5034,44 @@ class Eynollah: contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) self.logger.info("detection of reading order took %.1fs", time.time() - t_order) - if self.ocr: - ocr_all_textlines = [] + if self.ocr and not self.tr: + gc.collect() + if len(all_found_textline_polygons)>0: + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines = None + + if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: + ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_marginals_left = None + + if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: + ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, 
self.textline_light, self.curved_line) + else: + ocr_all_textlines_marginals_right = None + + if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: + ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_h, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_h = None + + if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: + ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines(image_page, polygons_of_drop_capitals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_drop = None else: ocr_all_textlines = None + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None + ocr_all_textlines_h = None + ocr_all_textlines_drop = None pcgts = self.writer.build_pagexml_full_layout( contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, - polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, - cont_page, polygons_lines_xml, ocr_all_textlines, conf_contours_textregions, conf_contours_textregions_h) + polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_lines_xml, ocr_all_textlines, ocr_all_textlines_h, ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, ocr_all_textlines_drop, conf_contours_textregions, conf_contours_textregions_h) return 
pcgts contours_only_text_parent_h = None @@ -4802,7 +5094,7 @@ class Eynollah: order_text_new, id_of_texts_tot = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - if self.ocr: + if self.ocr and self.tr: device = cuda.get_current_device() device.reset() gc.collect() @@ -4849,16 +5141,29 @@ class Eynollah: ocr_textline_in_textregion.append(text_ocr) ind_tot = ind_tot +1 ocr_all_textlines.append(ocr_textline_in_textregion) + + elif self.ocr and not self.tr: + gc.collect() + if len(all_found_textline_polygons)>0: + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + + if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: + ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + + if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: + ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None - #print(ocr_all_textlines) + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None self.logger.info("detection of reading order took %.1fs", time.time() - t_order) + pcgts = self.writer.build_pagexml_no_full_layout( txt_con_org, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, 
contours_tables, ocr_all_textlines, conf_contours_textregions) + all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, conf_contours_textregions) return pcgts @@ -4866,221 +5171,117 @@ class Eynollah_ocr: def __init__( self, dir_models, + model_name=None, dir_xmls=None, dir_in=None, + image_filename=None, dir_in_bin=None, dir_out=None, dir_out_image_text=None, tr_ocr=False, + batch_size=None, export_textline_images_and_text=False, do_not_mask_with_textline_contour=False, - draw_texts_on_image=False, prediction_with_both_of_rgb_and_bin=False, + pref_of_dataset=None, + min_conf_value_of_textline_text : Optional[float]=None, logger=None, ): self.dir_in = dir_in + self.image_filename = image_filename self.dir_in_bin = dir_in_bin self.dir_out = dir_out self.dir_xmls = dir_xmls self.dir_models = dir_models + self.model_name = model_name self.tr_ocr = tr_ocr self.export_textline_images_and_text = export_textline_images_and_text self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour - self.draw_texts_on_image = draw_texts_on_image self.dir_out_image_text = dir_out_image_text self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin - if tr_ocr: - self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" - self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - self.model_ocr.to(self.device) - - else: - self.model_ocr_dir = 
dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" - model_ocr = load_model(self.model_ocr_dir , compile=False) - - self.prediction_model = tf.keras.models.Model( - model_ocr.get_layer(name = "image").input, - model_ocr.get_layer(name = "dense2").output) - - - with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: - characters = json.load(config_file) - - - AUTOTUNE = tf.data.AUTOTUNE - - # Mapping characters to integers. - char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) - - # Mapping integers back to original characters. - self.num_to_char = StringLookup( - vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True - ) + self.pref_of_dataset = pref_of_dataset + self.logger = logger if logger else getLogger('eynollah') - def decode_batch_predictions(self, pred, max_len = 128): - # input_len is the product of the batch size and the - # number of time steps. - input_len = np.ones(pred.shape[0]) * pred.shape[1] - - # Decode CTC predictions using greedy search. - # decoded is a tuple with 2 elements. - decoded = tf.keras.backend.ctc_decode(pred, - input_length = input_len, - beam_width = 100) - # The outputs are in the first element of the tuple. - # Additionally, the first element is actually a list, - # therefore we take the first element of that list as well. - #print(decoded,'decoded') - decoded = decoded[0][0][:, :max_len] - - #print(decoded, decoded.shape,'decoded') - - output = [] - for d in decoded: - # Convert the predicted indices to the corresponding chars. - d = tf.strings.reduce_join(self.num_to_char(d)) - d = d.numpy().decode("utf-8") - output.append(d) - return output - - - def distortion_free_resize(self, image, img_size): - w, h = img_size - image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True) - - # Check tha amount of padding needed to be done. 
- pad_height = h - tf.shape(image)[0] - pad_width = w - tf.shape(image)[1] - - # Only necessary if you want to do same amount of padding on both sides. - if pad_height % 2 != 0: - height = pad_height // 2 - pad_height_top = height + 1 - pad_height_bottom = height - else: - pad_height_top = pad_height_bottom = pad_height // 2 - - if pad_width % 2 != 0: - width = pad_width // 2 - pad_width_left = width + 1 - pad_width_right = width - else: - pad_width_left = pad_width_right = pad_width // 2 - - image = tf.pad( - image, - paddings=[ - [pad_height_top, pad_height_bottom], - [pad_width_left, pad_width_right], - [0, 0], - ], - ) - - image = tf.transpose(image, (1, 0, 2)) - image = tf.image.flip_left_right(image) - return image - - def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.22*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - - if len(peaks_real)>35: - - #peaks_real = peaks_real[(peaks_realwidth1)] - argsort = np.argsort(sum_smoothed[peaks_real])[::-1] - peaks_real_top_six = peaks_real[argsort[:6]] - midpoint = textline_image.shape[1] / 2. 
- arg_closest = np.argmin(np.abs(peaks_real_top_six - midpoint)) - - #arg_max = np.argmax(sum_smoothed[peaks_real]) - - peaks_final = peaks_real_top_six[arg_closest]#peaks_real[arg_max] - - return peaks_final - else: - return None - - # Function to fit text inside the given area - def fit_text_single_line(self, draw, text, font_path, max_width, max_height): - initial_font_size = 50 - font_size = initial_font_size - while font_size > 10: # Minimum font size - font = ImageFont.truetype(font_path, font_size) - text_bbox = draw.textbbox((0, 0), text, font=font) # Get text bounding box - text_width = text_bbox[2] - text_bbox[0] - text_height = text_bbox[3] - text_bbox[1] - - if text_width <= max_width and text_height <= max_height: - return font # Return the best-fitting font - - font_size -= 2 # Reduce font size and retry - - return ImageFont.truetype(font_path, 10) # Smallest font fallback - - def return_textlines_split_if_needed(self, textline_image, textline_image_bin): - - split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image) - if split_point: - image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) - if self.prediction_with_both_of_rgb_and_bin: - image1_bin = textline_image_bin[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2_bin = textline_image_bin[:, split_point:,:]#image.crop((width1, 0, width, height)) - return [image1, image2], [image1_bin, image2_bin] + if not export_textline_images_and_text: + if min_conf_value_of_textline_text: + self.min_conf_value_of_textline_text = float(min_conf_value_of_textline_text) else: - return [image1, image2], None + self.min_conf_value_of_textline_text = 0.3 + if tr_ocr: + self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + if self.model_name: 
+ self.model_ocr_dir = self.model_name + else: + self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" + self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) + self.model_ocr.to(self.device) + if not batch_size: + self.b_s = 2 + else: + self.b_s = int(batch_size) + + else: + if self.model_name: + self.model_ocr_dir = self.model_name + else: + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250805" + model_ocr = load_model(self.model_ocr_dir , compile=False) + + self.prediction_model = tf.keras.models.Model( + model_ocr.get_layer(name = "image").input, + model_ocr.get_layer(name = "dense2").output) + if not batch_size: + self.b_s = 8 + else: + self.b_s = int(batch_size) + + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + characters = json.load(config_file) + + AUTOTUNE = tf.data.AUTOTUNE + + # Mapping characters to integers. + char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + # Mapping integers back to original characters. 
+ self.num_to_char = StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True + ) + self.end_character = len(characters) + 2 + + def run(self, overwrite : bool = False): + if self.dir_in: + ls_imgs = os.listdir(self.dir_in) else: - return None, None - def preprocess_and_resize_image_for_ocrcnn_model(self, img, image_height, image_width): - ratio = image_height /float(img.shape[0]) - w_ratio = int(ratio * img.shape[1]) - - if w_ratio <= image_width: - width_new = w_ratio - else: - width_new = image_width - - if width_new == 0: - width_new = img.shape[1] - - ##if width_new+32 >= image_width: - ##width_new = width_new - 32 - - ###patch_zero = np.zeros((32, 32, 3))#+255 - ###patch_zero[9:19,8:18,:] = 0 - - - img = resize_image(img, image_height, width_new) - img_fin = np.ones((image_height, image_width, 3))*255 - ###img_fin[:,:32,:] = patch_zero[:,:,:] - ###img_fin[:,32:32+width_new,:] = img[:,:,:] - img_fin[:,:width_new,:] = img[:,:,:] - img_fin = img_fin / 255. 
- return img_fin - - def run(self): - ls_imgs = os.listdir(self.dir_in) + ls_imgs = [self.image_filename] if self.tr_ocr: - b_s = 2 + tr_ocr_input_height_and_width = 384 for ind_img in ls_imgs: - t0 = time.time() - file_name = ind_img.split('.')[0] - dir_img = os.path.join(self.dir_in, ind_img) + if self.dir_in: + file_name = Path(ind_img).stem + dir_img = os.path.join(self.dir_in, ind_img) + else: + file_name = Path(self.image_filename).stem + dir_img = self.image_filename dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') + + if os.path.exists(out_file_ocr): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", out_file_ocr) + else: + self.logger.warning("will skip input for existing output file '%s'", out_file_ocr) + continue + img = cv2.imread(dir_img) + + if self.dir_out_image_text: + out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png') + image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white") + draw = ImageDraw.Draw(image_text) + total_bb_coordinates = [] ##file_name = Path(dir_xmls).stem tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8")) @@ -5111,6 +5312,9 @@ class Eynollah_ocr: textline_coords = np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) x,y,w,h = cv2.boundingRect(textline_coords) + if self.dir_out_image_text: + total_bb_coordinates.append([x,y,w,h]) + h2w_ratio = h/float(w) img_poly_on_img = np.copy(img) @@ -5122,15 +5326,15 @@ class Eynollah_ocr: img_crop[mask_poly==0] = 255 if h2w_ratio > 0.1: - cropped_lines.append(img_crop) + cropped_lines.append(resize_image(img_crop, tr_ocr_input_height_and_width, tr_ocr_input_height_and_width) ) cropped_lines_meging_indexing.append(0) else: - splited_images, _ = self.return_textlines_split_if_needed(img_crop, None) + splited_images, _ = return_textlines_split_if_needed(img_crop, None) #print(splited_images) if splited_images: - 
cropped_lines.append(splited_images[0]) + cropped_lines.append(resize_image(splited_images[0], tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) cropped_lines_meging_indexing.append(1) - cropped_lines.append(splited_images[1]) + cropped_lines.append(resize_image(splited_images[1], tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) cropped_lines_meging_indexing.append(-1) else: cropped_lines.append(img_crop) @@ -5139,28 +5343,59 @@ class Eynollah_ocr: extracted_texts = [] - n_iterations = math.ceil(len(cropped_lines) / b_s) + n_iterations = math.ceil(len(cropped_lines) / self.b_s) for i in range(n_iterations): if i==(n_iterations-1): - n_start = i*b_s + n_start = i*self.b_s imgs = cropped_lines[n_start:] else: - n_start = i*b_s - n_end = (i+1)*b_s + n_start = i*self.b_s + n_end = (i+1)*self.b_s imgs = cropped_lines[n_start:n_end] pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged + + del cropped_lines + gc.collect() - extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] #print(extracted_texts_merged, len(extracted_texts_merged)) unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) + + if self.dir_out_image_text: + + font_path = 
"Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! + font = ImageFont.truetype(font_path, 40) + + for indexer_text, bb_ind in enumerate(total_bb_coordinates): + + + x_bb = bb_ind[0] + y_bb = bb_ind[1] + w_bb = bb_ind[2] + h_bb = bb_ind[3] + + font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) + + ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) + + text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font) + text_width = text_bbox[2] - text_bbox[0] + text_height = text_bbox[3] - text_bbox[1] + + text_x = x_bb + (w_bb - text_width) // 2 # Center horizontally + text_y = y_bb + (h_bb - text_height) // 2 # Center vertically + + # Draw the text + draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font) + image_text.save(out_image_with_text) #print(len(unique_cropped_lines_region_indexer), 'unique_cropped_lines_region_indexer') text_by_textregion = [] @@ -5200,28 +5435,41 @@ class Eynollah_ocr: tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) #print("Job done in %.1fs", time.time() - t0) else: - max_len = 512 - padding_token = 299 + ###max_len = 280#512#280#512 + ###padding_token = 1500#299#1500#299 image_width = 512#max_len * 4 image_height = 32 - b_s = 8 img_size=(image_width, image_height) for ind_img in ls_imgs: - t0 = time.time() - file_name = ind_img.split('.')[0] - dir_img = os.path.join(self.dir_in, ind_img) + if self.dir_in: + file_name = Path(ind_img).stem + dir_img = os.path.join(self.dir_in, ind_img) + else: + file_name = Path(self.image_filename).stem + dir_img = self.image_filename + + #file_name = Path(ind_img).stem + #dir_img = os.path.join(self.dir_in, ind_img) dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') + + if os.path.exists(out_file_ocr): + if overwrite: + 
self.logger.warning("will overwrite existing output file '%s'", out_file_ocr) + else: + self.logger.warning("will skip input for existing output file '%s'", out_file_ocr) + continue + img = cv2.imread(dir_img) if self.prediction_with_both_of_rgb_and_bin: cropped_lines_bin = [] dir_img_bin = os.path.join(self.dir_in_bin, file_name+'.png') img_bin = cv2.imread(dir_img_bin) - if self.draw_texts_on_image: + if self.dir_out_image_text: out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png') image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white") draw = ImageDraw.Draw(image_text) @@ -5238,6 +5486,7 @@ class Eynollah_ocr: region_tags=np.unique([x for x in alltags if x.endswith('TextRegion')]) cropped_lines = [] + cropped_lines_ver_index = [] cropped_lines_region_indexer = [] cropped_lines_meging_indexing = [] @@ -5245,6 +5494,10 @@ class Eynollah_ocr: indexer_text_region = 0 indexer_textlines = 0 for nn in root1.iter(region_tags): + try: + type_textregion = nn.attrib['type'] + except: + type_textregion = 'paragraph' for child_textregion in nn: if child_textregion.tag.endswith("TextLine"): for child_textlines in child_textregion: @@ -5255,9 +5508,15 @@ class Eynollah_ocr: x,y,w,h = cv2.boundingRect(textline_coords) - if self.draw_texts_on_image: - total_bb_coordinates.append([x,y,w,h]) + angle_radians = math.atan2(h, w) + # Convert to degrees + angle_degrees = math.degrees(angle_radians) + if type_textregion=='drop-capital': + angle_degrees = 0 + if self.dir_out_image_text: + total_bb_coordinates.append([x,y,w,h]) + w_scaled = w * image_height/float(h) img_poly_on_img = np.copy(img) @@ -5268,112 +5527,317 @@ class Eynollah_ocr: mask_poly = np.zeros(img.shape) mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1)) + mask_poly = mask_poly[y:y+h, x:x+w, :] img_crop = img_poly_on_img[y:y+h, x:x+w, :] - if not self.do_not_mask_with_textline_contour: - img_crop[mask_poly==0] = 255 - if 
self.prediction_with_both_of_rgb_and_bin: - img_crop_bin[mask_poly==0] = 255 + + if self.export_textline_images_and_text: + if not self.do_not_mask_with_textline_contour: + img_crop[mask_poly==0] = 255 + + else: + #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') + + if angle_degrees > 3: + better_des_slope = get_orientation_moments(textline_coords) + + img_crop = rotate_image_with_padding(img_crop, better_des_slope ) + + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope ) + + mask_poly = rotate_image_with_padding(mask_poly, better_des_slope ) + mask_poly = mask_poly.astype('uint8') + + #new bounding box + x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0]) + + mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] + + if not self.do_not_mask_with_textline_contour: + img_crop[mask_poly==0] = 255 + + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] + if not self.do_not_mask_with_textline_contour: + img_crop_bin[mask_poly==0] = 255 + + if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90: + if self.prediction_with_both_of_rgb_and_bin: + img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + else: + img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + + + else: + better_des_slope = 0 + if not self.do_not_mask_with_textline_contour: + img_crop[mask_poly==0] = 255 + if self.prediction_with_both_of_rgb_and_bin: + if not self.do_not_mask_with_textline_contour: + img_crop_bin[mask_poly==0] = 255 + if type_textregion=='drop-capital': + pass + else: + if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90: + if self.prediction_with_both_of_rgb_and_bin: + img_crop, img_crop_bin = 
break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + else: + img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) if not self.export_textline_images_and_text: - if w_scaled < 1.5*image_width: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + if w_scaled < 750:#1.5*image_width: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) + if abs(better_des_slope) > 45: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + cropped_lines_meging_indexing.append(0) if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) else: if self.prediction_with_both_of_rgb_and_bin: - splited_images, splited_images_bin = self.return_textlines_split_if_needed(img_crop, img_crop_bin) + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, img_crop_bin, prediction_with_both_of_rgb_and_bin=self.prediction_with_both_of_rgb_and_bin) else: - splited_images, splited_images_bin = self.return_textlines_split_if_needed(img_crop, None) + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) if splited_images: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(1) - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + + if abs(better_des_slope) > 45: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + + 
img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(-1) + if abs(better_des_slope) > 45: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width) cropped_lines_bin.append(img_fin) - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width) cropped_lines_bin.append(img_fin) else: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(0) + if abs(better_des_slope) > 45: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) if self.export_textline_images_and_text: - if child_textlines.tag.endswith("TextEquiv"): - for cheild_text in child_textlines: - if cheild_text.tag.endswith("Unicode"): - textline_text = cheild_text.text - if textline_text: - with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: - text_file.write(textline_text) + if img_crop.shape[0]==0 or img_crop.shape[1]==0: + pass + else: + if child_textlines.tag.endswith("TextEquiv"): + for cheild_text 
in child_textlines: + if cheild_text.tag.endswith("Unicode"): + textline_text = cheild_text.text + if textline_text: + if self.do_not_mask_with_textline_contour: + if self.pref_of_dataset: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.txt'), 'w') as text_file: + text_file.write(textline_text) - cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) - - indexer_textlines+=1 + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.png'), img_crop ) + else: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) + else: + if self.pref_of_dataset: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.png'), img_crop ) + else: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.png'), img_crop ) + + indexer_textlines+=1 if not self.export_textline_images_and_text: indexer_text_region = indexer_text_region +1 if not self.export_textline_images_and_text: extracted_texts = [] + extracted_conf_value = [] - n_iterations = math.ceil(len(cropped_lines) / b_s) + n_iterations = math.ceil(len(cropped_lines) / self.b_s) for i in range(n_iterations): if i==(n_iterations-1): - n_start = i*b_s + n_start = i*self.b_s imgs = cropped_lines[n_start:] imgs = np.array(imgs) imgs = imgs.reshape(imgs.shape[0], image_height, 
image_width, 3) + + ver_imgs = np.array( cropped_lines_ver_index[n_start:] ) + indices_ver = np.where(ver_imgs == 1)[0] + + #print(indices_ver, 'indices_ver') + if len(indices_ver)>0: + imgs_ver_flipped = imgs[indices_ver, : ,: ,:] + imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + + else: + imgs_ver_flipped = None + if self.prediction_with_both_of_rgb_and_bin: imgs_bin = cropped_lines_bin[n_start:] imgs_bin = np.array(imgs_bin) imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3) + + if len(indices_ver)>0: + imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:] + imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + + else: + imgs_bin_ver_flipped = None else: - n_start = i*b_s - n_end = (i+1)*b_s + n_start = i*self.b_s + n_end = (i+1)*self.b_s imgs = cropped_lines[n_start:n_end] - imgs = np.array(imgs).reshape(b_s, image_height, image_width, 3) + imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3) + + ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] ) + indices_ver = np.where(ver_imgs == 1)[0] + #print(indices_ver, 'indices_ver') + + if len(indices_ver)>0: + imgs_ver_flipped = imgs[indices_ver, : ,: ,:] + imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + else: + imgs_ver_flipped = None + if self.prediction_with_both_of_rgb_and_bin: imgs_bin = cropped_lines_bin[n_start:n_end] - imgs_bin = np.array(imgs_bin).reshape(b_s, image_height, image_width, 3) + imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3) + + + if len(indices_ver)>0: + imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:] + imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + else: + imgs_bin_ver_flipped = None preds = self.prediction_model.predict(imgs, verbose=0) + + if len(indices_ver)>0: + preds_flipped = 
self.prediction_model.predict(imgs_ver_flipped, verbose=0) + preds_max_fliped = np.max(preds_flipped, axis=2 ) + preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) + pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character + masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) + masked_means_flipped[np.isnan(masked_means_flipped)] = 0 + + preds_max = np.max(preds, axis=2 ) + preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character + + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means[np.isnan(masked_means)] = 0 + + masked_means_ver = masked_means[indices_ver] + #print(masked_means_ver, 'pred_max_not_unk') + + indices_where_flipped_conf_value_is_higher = np.where(masked_means_flipped > masked_means_ver)[0] + + #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher') + if len(indices_where_flipped_conf_value_is_higher)>0: + indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher] + preds[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] if self.prediction_with_both_of_rgb_and_bin: preds_bin = self.prediction_model.predict(imgs_bin, verbose=0) + + if len(indices_ver)>0: + preds_flipped = self.prediction_model.predict(imgs_bin_ver_flipped, verbose=0) + preds_max_fliped = np.max(preds_flipped, axis=2 ) + preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) + pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character + masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) + masked_means_flipped[np.isnan(masked_means_flipped)] = 0 + + preds_max = np.max(preds, axis=2 ) + 
preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character + + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means[np.isnan(masked_means)] = 0 + + masked_means_ver = masked_means[indices_ver] + #print(masked_means_ver, 'pred_max_not_unk') + + indices_where_flipped_conf_value_is_higher = np.where(masked_means_flipped > masked_means_ver)[0] + + #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher') + if len(indices_where_flipped_conf_value_is_higher)>0: + indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher] + preds_bin[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] + preds = (preds + preds_bin) / 2. + - pred_texts = self.decode_batch_predictions(preds) + pred_texts = decode_batch_predictions(preds, self.num_to_char) + + preds_max = np.max(preds, axis=2 ) + preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) for ib in range(imgs.shape[0]): pred_texts_ib = pred_texts[ib].replace("[UNK]", "") - extracted_texts.append(pred_texts_ib) - + if masked_means[ib] >= self.min_conf_value_of_textline_text: + extracted_texts.append(pred_texts_ib) + extracted_conf_value.append(masked_means[ib]) + else: + extracted_texts.append("") + extracted_conf_value.append(0) + del cropped_lines + if self.prediction_with_both_of_rgb_and_bin: + del cropped_lines_bin + gc.collect() + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + + extracted_conf_value_merged = 
[extracted_conf_value[ind] if cropped_lines_meging_indexing[ind]==0 else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2. if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm] for ind_cfm in range(len(extracted_texts_merged)) if extracted_texts_merged[ind_cfm] is not None] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) - if self.draw_texts_on_image: + if self.dir_out_image_text: - font_path = "NotoSans-Regular.ttf" # Make sure this file exists! + font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! font = ImageFont.truetype(font_path, 40) for indexer_text, bb_ind in enumerate(total_bb_coordinates): @@ -5384,7 +5848,7 @@ class Eynollah_ocr: w_bb = bb_ind[2] h_bb = bb_ind[3] - font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) + font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) @@ -5402,11 +5866,40 @@ class Eynollah_ocr: text_by_textregion = [] for ind in unique_cropped_lines_region_indexer: extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] - text_by_textregion.append("".join(extracted_texts_merged_un)) + if len(extracted_texts_merged_un)>1: + text_by_textregion_ind = "" + next_glue = "" + for indt in range(len(extracted_texts_merged_un)): + if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-'): + text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1] + next_glue = "" + else: + text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt] 
+ next_glue = " " + text_by_textregion.append(text_by_textregion_ind) + + else: + text_by_textregion.append(" ".join(extracted_texts_merged_un)) + #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion') + + ###index_tot_regions = [] + ###tot_region_ref = [] + + ###for jj in root1.iter(link+'RegionRefIndexed'): + ###index_tot_regions.append(jj.attrib['index']) + ###tot_region_ref.append(jj.attrib['regionRef']) + + ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)} + + #id_textregions = [] + #textregions_by_existing_ids = [] indexer = 0 indexer_textregion = 0 for nn in root1.iter(region_tags): + #id_textregion = nn.attrib['id'] + #id_textregions.append(id_textregion) + #textregions_by_existing_ids.append(text_by_textregion[indexer_textregion]) is_textregion_text = False for childtest in nn: @@ -5430,6 +5923,7 @@ class Eynollah_ocr: if not is_textline_text: text_subelement = ET.SubElement(child_textregion, 'TextEquiv') + text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") unicode_textline = ET.SubElement(text_subelement, 'Unicode') unicode_textline.text = extracted_texts_merged[indexer] else: @@ -5437,6 +5931,7 @@ class Eynollah_ocr: if childtest3.tag.endswith("TextEquiv"): for child_uc in childtest3: if child_uc.tag.endswith("Unicode"): + childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") child_uc.text = extracted_texts_merged[indexer] indexer = indexer + 1 @@ -5451,7 +5946,17 @@ class Eynollah_ocr: else: unicode_textregion.text = text_by_textregion[indexer_textregion] indexer_textregion = indexer_textregion + 1 - + + ###sample_order = [(id_to_order[tid], text) for tid, text in zip(id_textregions, textregions_by_existing_ids) if tid in id_to_order] + + ##ordered_texts_sample = [text for _, text in sorted(sample_order)] + ##tot_page_text = ' '.join(ordered_texts_sample) + + ##for page_element in root1.iter(link+'Page'): + ##text_page 
= ET.SubElement(page_element, 'TextEquiv') + ##unicode_textpage = ET.SubElement(text_page, 'Unicode') + ##unicode_textpage.text = tot_page_text + ET.register_namespace("",name_space) tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) #print("Job done in %.1fs", time.time() - t0) diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py new file mode 100644 index 0000000..983712d --- /dev/null +++ b/src/eynollah/image_enhancer.py @@ -0,0 +1,735 @@ +""" +Image enhancer. The output can be written as same scale of input or in new predicted scale. +""" + +from logging import Logger +from difflib import SequenceMatcher as sq +from PIL import Image, ImageDraw, ImageFont +import math +import os +import sys +import time +from typing import Optional +import atexit +import warnings +from functools import partial +from pathlib import Path +from multiprocessing import cpu_count +import gc +import copy +from loky import ProcessPoolExecutor +import xml.etree.ElementTree as ET +import cv2 +import numpy as np +from ocrd import OcrdPage +from ocrd_utils import getLogger, tf_disable_interactive_logs +import statistics +from tensorflow.keras.models import load_model +from .utils.resize import resize_image +from .utils import ( + crop_image_inside_box +) + +DPI_THRESHOLD = 298 +KERNEL = np.ones((5, 5), np.uint8) + + +class Enhancer: + def __init__( + self, + dir_models : str, + dir_out : Optional[str] = None, + num_col_upper : Optional[int] = None, + num_col_lower : Optional[int] = None, + save_org_scale : bool = False, + logger : Optional[Logger] = None, + ): + self.dir_out = dir_out + self.input_binary = False + self.light_version = False + self.save_org_scale = save_org_scale + if num_col_upper: + self.num_col_upper = int(num_col_upper) + else: + self.num_col_upper = num_col_upper + if num_col_lower: + self.num_col_lower = int(num_col_lower) + else: + self.num_col_lower = num_col_lower + + self.logger = logger if 
logger else getLogger('enhancement') + # for parallelization of CPU-intensive tasks: + self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) + atexit.register(self.executor.shutdown) + self.dir_models = dir_models + self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" + self.model_dir_of_col_classifier = dir_models + "/eynollah-column-classifier_20210425" + self.model_page_dir = dir_models + "/eynollah-page-extraction_20210425" + + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + self.logger.warning("no GPU device available") + + self.model_page = self.our_load_model(self.model_page_dir) + self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) + self.model_enhancement = self.our_load_model(self.model_dir_of_enhancement) + + def cache_images(self, image_filename=None, image_pil=None, dpi=None): + ret = {} + t_c0 = time.time() + if image_filename: + ret['img'] = cv2.imread(image_filename) + if self.light_version: + self.dpi = 100 + else: + self.dpi = 0#check_dpi(image_filename) + else: + ret['img'] = pil2cv(image_pil) + if self.light_version: + self.dpi = 100 + else: + self.dpi = 0#check_dpi(image_pil) + ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) + for prefix in ('', '_grayscale'): + ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8) + self._imgs = ret + if dpi is not None: + self.dpi = dpi + + def reset_file_name_dir(self, image_filename): + t_c = time.time() + self.cache_images(image_filename=image_filename) + self.output_filename = os.path.join(self.dir_out, Path(image_filename).stem +'.png') + + def imread(self, grayscale=False, uint8=True): + key = 'img' + if grayscale: + key += '_grayscale' + if uint8: + key += '_uint8' + return self._imgs[key].copy() + + def isNaN(self, num): + return num != num + + @staticmethod + def our_load_model(model_file): + if 
model_file.endswith('.h5') and Path(model_file[:-3]).exists(): + # prefer SavedModel over HDF5 format if it exists + model_file = model_file[:-3] + try: + model = load_model(model_file, compile=False) + except: + model = load_model(model_file, compile=False, custom_objects={ + "PatchEncoder": PatchEncoder, "Patches": Patches}) + return model + + def predict_enhancement(self, img): + self.logger.debug("enter predict_enhancement") + + img_height_model = self.model_enhancement.layers[-1].output_shape[1] + img_width_model = self.model_enhancement.layers[-1].output_shape[2] + if img.shape[0] < img_height_model: + img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST) + if img.shape[1] < img_width_model: + img = cv2.resize(img, (img_height_model, img.shape[0]), interpolation=cv2.INTER_NEAREST) + margin = int(0.1 * img_width_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / 255. + img_h = img.shape[0] + img_w = img.shape[1] + + prediction_true = np.zeros((img_h, img_w, 3)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :] + label_p_pred = self.model_enhancement.predict(img_patch, verbose=0) + seg = label_p_pred[0, :, :, :] * 255 + + if i == 0 and j == 0: 
+ prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[0:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:, + margin:] + elif i == 0 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:, + 0:-margin or None] + elif i == nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[0:-margin or None, + margin:] + elif i == 0 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:-margin or None, + margin:] + elif i != 0 and i != nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - margin] = \ + seg[0:-margin or None, + margin:-margin or None] + elif i != 0 and i != nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - margin] = \ + seg[margin:, + margin:-margin or None] + else: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + margin:index_x_u - margin] = \ + seg[margin:-margin or None, + margin:-margin or None] + + prediction_true = prediction_true.astype(int) + return prediction_true + + def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1: + img_w_new = 2000 + elif num_col == 2: + img_w_new = 2400 + elif num_col == 3: + img_w_new = 3000 + elif num_col == 4: + img_w_new = 4000 + elif num_col == 5: + img_w_new = 5000 + elif num_col == 6: + img_w_new = 
6500 + else: + img_w_new = width_early + img_h_new = img_w_new * img.shape[0] // img.shape[1] + + if img_h_new >= 8000: + img_new = np.copy(img) + num_column_is_classified = False + else: + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, num_column_is_classified + + def early_page_for_num_of_column_classification(self,img_bin): + self.logger.debug("enter early_page_for_num_of_column_classification") + if self.input_binary: + img = np.copy(img_bin).astype(np.uint8) + else: + img = self.imread() + img = cv2.GaussianBlur(img, (5, 5), 0) + img_page_prediction = self.do_prediction(False, img, self.model_page) + + imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + thresh = cv2.dilate(thresh, KERNEL, iterations=3) + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + if len(contours)>0: + cnt_size = np.array([cv2.contourArea(contours[j]) + for j in range(len(contours))]) + cnt = contours[np.argmax(cnt_size)] + box = cv2.boundingRect(cnt) + else: + box = [0, 0, img.shape[1], img.shape[0]] + cropped_page, page_coord = crop_image_inside_box(box, img) + + self.logger.debug("exit early_page_for_num_of_column_classification") + return cropped_page, page_coord + + def calculate_width_height_by_columns_1_2(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1: + img_w_new = 1000 + else: + img_w_new = 1300 + img_h_new = img_w_new * img.shape[0] // img.shape[1] + + if label_p_pred[0][int(num_col - 1)] < 0.9 and img_w_new < width_early: + img_new = np.copy(img) + num_column_is_classified = False + #elif label_p_pred[0][int(num_col - 1)] < 0.8 and img_h_new >= 8000: + elif img_h_new >= 8000: + img_new = np.copy(img) + num_column_is_classified = False + else: + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, 
num_column_is_classified + + def resize_and_enhance_image_with_column_classifier(self, light_version): + self.logger.debug("enter resize_and_enhance_image_with_column_classifier") + dpi = 0#self.dpi + self.logger.info("Detected %s DPI", dpi) + if self.input_binary: + img = self.imread() + prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) + prediction_bin = 255 * (prediction_bin[:,:,0]==0) + prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) + img= np.copy(prediction_bin) + img_bin = prediction_bin + else: + img = self.imread() + self.h_org, self.w_org = img.shape[:2] + img_bin = None + + width_early = img.shape[1] + t1 = time.time() + _, page_coord = self.early_page_for_num_of_column_classification(img_bin) + + self.image_page_org_size = img[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] + self.page_coord = page_coord + + if self.num_col_upper and not self.num_col_lower: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + elif self.num_col_lower and not self.num_col_upper: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + elif not self.num_col_upper and not self.num_col_lower: + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + elif (self.num_col_upper and 
self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + + if num_col > self.num_col_upper: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + if num_col < self.num_col_lower: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + else: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + + self.logger.info("Found %d columns (%s)", num_col, np.around(label_p_pred, decimals=5)) + + if dpi < DPI_THRESHOLD: + if light_version and num_col in (1,2): + img_new, num_column_is_classified = self.calculate_width_height_by_columns_1_2( + img, num_col, width_early, label_p_pred) + else: + img_new, num_column_is_classified = self.calculate_width_height_by_columns( + img, num_col, width_early, label_p_pred) + if light_version: + image_res = np.copy(img_new) + else: + image_res = self.predict_enhancement(img_new) + is_image_enhanced = True + + else: + num_column_is_classified = True + image_res = np.copy(img) + is_image_enhanced = False + + self.logger.debug("exit resize_and_enhance_image_with_column_classifier") + return is_image_enhanced, img, image_res, num_col, num_column_is_classified, img_bin + def do_prediction( + self, patches, img, model, + n_batch_inference=1, marginal_of_patch_percent=0.1, + 
thresholding_for_some_classes_in_light_version=False, + thresholding_for_artificial_class_in_light_version=False, thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): + + self.logger.debug("enter do_prediction") + img_height_model = model.layers[-1].output_shape[1] + img_width_model = model.layers[-1].output_shape[2] + + if not patches: + img_h_page = img.shape[0] + img_w_page = img.shape[1] + img = img / float(255.0) + img = resize_image(img, img_height_model, img_width_model) + + label_p_pred = model.predict(img[np.newaxis], verbose=0) + seg = np.argmax(label_p_pred, axis=3)[0] + + if thresholding_for_artificial_class_in_light_version: + seg_art = label_p_pred[0,:,:,2] + + seg_art[seg_art0] =1 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 + + seg[skeleton_art==1]=2 + + if thresholding_for_fl_light_version: + seg_header = label_p_pred[0,:,:,2] + + seg_header[seg_header<0.2] = 0 + seg_header[seg_header>0] =1 + + seg[seg_header==1]=2 + + seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) + prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) + return prediction_true + + if img.shape[0] < img_height_model: + img = resize_image(img, img_height_model, img.shape[1]) + if img.shape[1] < img_width_model: + img = resize_image(img, img.shape[0], img_width_model) + + self.logger.debug("Patch size: %sx%s", img_height_model, img_width_model) + margin = int(marginal_of_patch_percent * img_height_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / 255. 
+ #img = img.astype(np.float16) + img_h = img.shape[0] + img_w = img.shape[1] + prediction_true = np.zeros((img_h, img_w, 3)) + mask_true = np.zeros((img_h, img_w)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + list_i_s = [] + list_j_s = [] + list_x_u = [] + list_x_d = [] + list_y_u = [] + list_y_d = [] + + batch_indexer = 0 + img_patch = np.zeros((n_batch_inference, img_height_model, img_width_model, 3)) + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + list_i_s.append(i) + list_j_s.append(j) + list_x_u.append(index_x_u) + list_x_d.append(index_x_d) + list_y_d.append(index_y_d) + list_y_u.append(index_y_u) + + img_patch[batch_indexer,:,:,:] = img[index_y_d:index_y_u, index_x_d:index_x_u, :] + batch_indexer += 1 + + if (batch_indexer == n_batch_inference or + # last batch + i == nxf - 1 and j == nyf - 1): + self.logger.debug("predicting patches on %s", str(img_patch.shape)) + label_p_pred = model.predict(img_patch, verbose=0) + seg = np.argmax(label_p_pred, axis=3) + + if thresholding_for_some_classes_in_light_version: + seg_not_base = label_p_pred[:,:,:,4] + seg_not_base[seg_not_base>0.03] =1 + seg_not_base[seg_not_base<1] =0 + + seg_line = label_p_pred[:,:,:,3] + seg_line[seg_line>0.1] =1 + seg_line[seg_line<1] =0 + + seg_background = label_p_pred[:,:,:,0] + seg_background[seg_background>0.25] =1 + seg_background[seg_background<1] =0 + + 
seg[seg_not_base==1]=4 + seg[seg_background==1]=0 + seg[(seg_line==1) & (seg==0)]=3 + if thresholding_for_artificial_class_in_light_version: + seg_art = label_p_pred[:,:,:,2] + + seg_art[seg_art0] =1 + + ##seg[seg_art==1]=2 + + indexer_inside_batch = 0 + for i_batch, j_batch in zip(list_i_s, list_j_s): + seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] + + index_y_u_in = list_y_u[indexer_inside_batch] + index_y_d_in = list_y_d[indexer_inside_batch] + + index_x_u_in = list_x_u[indexer_inside_batch] + index_x_d_in = list_x_d[indexer_inside_batch] + + if i_batch == 0 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[0:-margin or None, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0] = \ + seg_in[margin:, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + + elif i_batch == 0 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[margin:, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 
margin:index_x_u_in - 0] = \ + seg_in[0:-margin or None, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[margin:-margin or None, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0] = \ + seg_in[margin:-margin or None, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] + + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin] = \ + seg_in[0:-margin or None, + margin:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] + + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin] = \ + seg_in[margin:, + margin:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in 
def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
    """
    Enhance one image, or every entry of a directory, and write the results.

    Each input is registered through ``reset_file_name_dir`` (which is
    expected to set ``self.output_filename``), enhanced with ``run_single``,
    resized to ``self.h_org`` x ``self.w_org`` when ``self.save_org_scale``
    is set, and finally written with ``cv2.imwrite``.

    Args:
        image_filename: path of a single input image; only used when
            ``dir_in`` is not given.
        dir_in: directory whose entries are all processed, in sorted order.
        overwrite: when False, inputs whose output file already exists are
            skipped; when True they are processed again and overwritten.

    Raises:
        ValueError: if neither ``image_filename`` nor ``dir_in`` is given.
    """
    self.logger.debug("enter run")

    if dir_in:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # processing order (and therefore the log) reproducible.
        self.ls_imgs = sorted(os.listdir(dir_in))
    elif image_filename:
        self.ls_imgs = [image_filename]
    else:
        raise ValueError("run requires either a single image filename or a directory")

    for img_filename in self.ls_imgs:
        self.logger.info(img_filename)

        # For a single file dir_in is None, so the path is used unchanged.
        self.reset_file_name_dir(os.path.join(dir_in or "", img_filename))

        if os.path.exists(self.output_filename):
            if not overwrite:
                self.logger.warning("will skip input for existing output file '%s'", self.output_filename)
                continue
            self.logger.warning("will overwrite existing output file '%s'", self.output_filename)

        image_enhanced = self.run_single()
        if self.save_org_scale:
            image_enhanced = resize_image(image_enhanced, self.h_org, self.w_org)

        # cv2.imwrite reports failure through its return value rather than
        # by raising, so surface it instead of losing the result silently.
        if not cv2.imwrite(self.output_filename, image_enhanced):
            self.logger.error("could not write output file '%s'", self.output_filename)
def __init__(
    self,
    dir_models : str,
    dir_out : Optional[str] = None,
    logger : Optional[Logger] = None,
):
    """
    Set up the worker pool and load the reading-order model.

    Args:
        dir_models: directory that contains the
            ``model_eynollah_reading_order_20250824`` model.
        dir_out: directory stored as ``self.dir_out``.
        logger: logger to use; defaults to ``getLogger('mbro on layout')``.
    """
    # This module only imports ``load_model`` from tensorflow at file level,
    # so ``tf`` has to be brought into scope here. Without it the loop below
    # raised NameError on every call and memory growth was never enabled.
    import tensorflow as tf

    self.dir_out = dir_out

    self.logger = logger if logger else getLogger('mbro on layout')
    # for parallelization of CPU-intensive tasks:
    self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200)
    atexit.register(self.executor.shutdown)
    self.dir_models = dir_models
    self.model_reading_order_dir = dir_models + "/model_eynollah_reading_order_20250824"

    # Best effort: enabling memory growth must not prevent start-up, but the
    # actual reason for a failure is logged instead of being hidden.
    # With no GPU present the loop simply does not execute.
    try:
        for device in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(device, True)
    except Exception as err:
        self.logger.warning("could not enable GPU memory growth: %s", err)

    self.model_reading_order = self.our_load_model(self.model_reading_order_dir)
    self.light_version = True
img = cv2.resize(img, (img_height_model, img.shape[0]), interpolation=cv2.INTER_NEAREST) + margin = int(0.1 * img_width_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / 255. + img_h = img.shape[0] + img_w = img.shape[1] + + prediction_true = np.zeros((img_h, img_w, 3)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :] + label_p_pred = self.model_enhancement.predict(img_patch, verbose=0) + seg = label_p_pred[0, :, :, :] * 255 + + if i == 0 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[0:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:, + margin:] + elif i == 0 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:, + 0:-margin or None] + elif i == nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[0:-margin or None, + margin:] + elif i == 0 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + 0:index_x_u - 
def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred):
    """
    Pick a target width from the column count and resize the image to it.

    The width is 2000/2400/3000/4000/5000/6500 for 1..6 columns and
    ``width_early`` for any other value; the height follows from the aspect
    ratio of ``img``. If that height would reach 8000 the image is copied
    unchanged instead of being resized.

    Args:
        img: image array; only ``shape[0]`` and ``shape[1]`` are read here.
        num_col: number of columns.
        width_early: fallback width for column counts outside 1..6.
        label_p_pred: accepted for signature compatibility; not used here.

    Returns:
        ``(image, resized)`` where ``resized`` is True only when the image
        was actually resized.
    """
    self.logger.debug("enter calculate_width_height_by_columns")

    # Compared with ``==`` one by one, so any numeric type works for num_col.
    target_width = width_early
    for columns, width in ((1, 2000), (2, 2400), (3, 3000),
                           (4, 4000), (5, 5000), (6, 6500)):
        if num_col == columns:
            target_width = width
            break
    target_height = target_width * img.shape[0] // img.shape[1]

    if target_height >= 8000:
        # too tall after scaling: keep the original resolution
        return np.copy(img), False
    return resize_image(img, target_height, target_width), True
def calculate_width_height_by_columns_1_2(self, img, num_col, width_early, label_p_pred):
    """
    Resize a one- or two-column image to a fixed target width.

    The width is 1000 for one column and 1300 otherwise; the height follows
    from the aspect ratio of ``img``. The image is copied unchanged instead
    of being resized when either

    * the classifier score for ``num_col`` is below 0.9 and the target width
      is smaller than ``width_early``, or
    * the target height would reach 8000.

    Args:
        img: image array; only ``shape[0]`` and ``shape[1]`` are read here.
        num_col: number of columns.
        width_early: width that the target width is compared against.
        label_p_pred: classifier output; ``label_p_pred[0][num_col - 1]`` is
            read as the score of the chosen column count.

    Returns:
        ``(image, resized)`` where ``resized`` is True only when the image
        was actually resized.
    """
    # The message previously named calculate_width_height_by_columns, which
    # made debug traces point at the wrong function.
    self.logger.debug("enter calculate_width_height_by_columns_1_2")

    img_w_new = 1000 if num_col == 1 else 1300
    img_h_new = img_w_new * img.shape[0] // img.shape[1]

    low_confidence = label_p_pred[0][int(num_col - 1)] < 0.9
    if (low_confidence and img_w_new < width_early) or img_h_new >= 8000:
        return np.copy(img), False
    return resize_image(img, img_h_new, img_w_new), True
self.early_page_for_num_of_column_classification(img_bin) + + self.image_page_org_size = img[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] + self.page_coord = page_coord + + if self.num_col_upper and not self.num_col_lower: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + elif self.num_col_lower and not self.num_col_upper: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + elif not self.num_col_upper and not self.num_col_lower: + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = 
self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + + if num_col > self.num_col_upper: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + if num_col < self.num_col_lower: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + else: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + + self.logger.info("Found %d columns (%s)", num_col, np.around(label_p_pred, decimals=5)) + + if dpi < DPI_THRESHOLD: + if light_version and num_col in (1,2): + img_new, num_column_is_classified = self.calculate_width_height_by_columns_1_2( + img, num_col, width_early, label_p_pred) + else: + img_new, num_column_is_classified = self.calculate_width_height_by_columns( + img, num_col, width_early, label_p_pred) + if light_version: + image_res = np.copy(img_new) + else: + image_res = self.predict_enhancement(img_new) + is_image_enhanced = True + + else: + num_column_is_classified = True + image_res = np.copy(img) + is_image_enhanced = False + + self.logger.debug("exit resize_and_enhance_image_with_column_classifier") + return is_image_enhanced, img, image_res, num_col, num_column_is_classified, img_bin + def read_xml(self, xml_file): + file_name = Path(xml_file).stem + tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8')) + root1=tree1.getroot() + alltags=[elem.tag for elem in root1.iter()] + link=alltags[0].split('}')[0]+'}' + + index_tot_regions = [] + tot_region_ref = [] + + for jj in root1.iter(link+'Page'): + y_len=int(jj.attrib['imageHeight']) + x_len=int(jj.attrib['imageWidth']) + + for jj in root1.iter(link+'RegionRefIndexed'): + index_tot_regions.append(jj.attrib['index']) + tot_region_ref.append(jj.attrib['regionRef']) + + if (link+'PrintSpace' in alltags) or (link+'Border' in alltags): + co_printspace = [] + if link+'PrintSpace' in alltags: + region_tags_printspace = np.unique([x for x in alltags if x.endswith('PrintSpace')]) + elif link+'Border' in alltags: + region_tags_printspace = 
np.unique([x for x in alltags if x.endswith('Border')]) + + for tag in region_tags_printspace: + if link+'PrintSpace' in alltags: + tag_endings_printspace = ['}PrintSpace','}printspace'] + elif link+'Border' in alltags: + tag_endings_printspace = ['}Border','}border'] + + if tag.endswith(tag_endings_printspace[0]) or tag.endswith(tag_endings_printspace[1]): + for nn in root1.iter(tag): + c_t_in = [] + sumi = 0 + for vv in nn.iter(): + # check the format of coords + if vv.tag == link + 'Coords': + coords = bool(vv.attrib) + if coords: + p_h = vv.attrib['points'].split(' ') + c_t_in.append( + np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) + break + else: + pass + + if vv.tag == link + 'Point': + c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) + sumi += 1 + elif vv.tag != link + 'Point' and sumi >= 1: + break + co_printspace.append(np.array(c_t_in)) + img_printspace = np.zeros( (y_len,x_len,3) ) + img_printspace=cv2.fillPoly(img_printspace, pts =co_printspace, color=(1,1,1)) + img_printspace = img_printspace.astype(np.uint8) + + imgray = cv2.cvtColor(img_printspace, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) + cnt = contours[np.argmax(cnt_size)] + x, y, w, h = cv2.boundingRect(cnt) + + bb_coord_printspace = [x, y, w, h] + + else: + bb_coord_printspace = None + + + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + co_text_paragraph=[] + co_text_drop=[] + co_text_heading=[] + co_text_header=[] + co_text_marginalia=[] + co_text_catch=[] + co_text_page_number=[] + co_text_signature_mark=[] + co_sep=[] + co_img=[] + co_table=[] + co_graphic=[] + co_graphic_text_annotation=[] + co_graphic_decoration=[] + co_noise=[] + + co_text_paragraph_text=[] + co_text_drop_text=[] + co_text_heading_text=[] + co_text_header_text=[] + 
co_text_marginalia_text=[] + co_text_catch_text=[] + co_text_page_number_text=[] + co_text_signature_mark_text=[] + co_sep_text=[] + co_img_text=[] + co_table_text=[] + co_graphic_text=[] + co_graphic_text_annotation_text=[] + co_graphic_decoration_text=[] + co_noise_text=[] + + id_paragraph = [] + id_header = [] + id_heading = [] + id_marginalia = [] + + for tag in region_tags: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + for nn in root1.iter(tag): + for child2 in nn: + tag2 = child2.tag + if tag2.endswith('}TextEquiv') or tag2.endswith('}TextEquiv'): + for childtext2 in child2: + if childtext2.tag.endswith('}Unicode') or childtext2.tag.endswith('}Unicode'): + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + co_text_drop_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='heading': + co_text_heading_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + co_text_signature_mark_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='header': + co_text_header_text.append(childtext2.text) + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###co_text_catch_text.append(childtext2.text) + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + ###co_text_page_number_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + co_text_marginalia_text.append(childtext2.text) + else: + co_text_paragraph_text.append(childtext2.text) + c_t_in_drop=[] + c_t_in_paragraph=[] + c_t_in_heading=[] + c_t_in_header=[] + c_t_in_page_number=[] + c_t_in_signature_mark=[] + c_t_in_catch=[] + c_t_in_marginalia=[] + + + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + + coords=bool(vv.attrib) + if coords: + #print('birda1') + p_h=vv.attrib['points'].split(' ') + + + + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + + c_t_in_drop.append( 
np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + ##id_heading.append(nn.attrib['id']) + c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + elif "type" in nn.attrib and nn.attrib['type']=='header': + #id_header.append(nn.attrib['id']) + c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + ###c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + #id_marginalia.append(nn.attrib['id']) + + c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + else: + #id_paragraph.append(nn.attrib['id']) + + c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + + c_t_in_drop.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + #id_heading.append(nn.attrib['id']) + c_t_in_heading.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + 
sumi+=1 + elif "type" in nn.attrib and nn.attrib['type']=='header': + #id_header.append(nn.attrib['id']) + c_t_in_header.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###c_t_in_catch.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + ###sumi+=1 + + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + ###c_t_in_page_number.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + ###sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + #id_marginalia.append(nn.attrib['id']) + + c_t_in_marginalia.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + else: + #id_paragraph.append(nn.attrib['id']) + c_t_in_paragraph.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + + if len(c_t_in_drop)>0: + co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_paragraph)>0: + co_text_paragraph.append(np.array(c_t_in_paragraph)) + id_paragraph.append(nn.attrib['id']) + if len(c_t_in_heading)>0: + co_text_heading.append(np.array(c_t_in_heading)) + id_heading.append(nn.attrib['id']) + + if len(c_t_in_header)>0: + co_text_header.append(np.array(c_t_in_header)) + id_header.append(nn.attrib['id']) + if len(c_t_in_page_number)>0: + co_text_page_number.append(np.array(c_t_in_page_number)) + if len(c_t_in_catch)>0: + co_text_catch.append(np.array(c_t_in_catch)) + + if len(c_t_in_signature_mark)>0: + co_text_signature_mark.append(np.array(c_t_in_signature_mark)) + + if len(c_t_in_marginalia)>0: + co_text_marginalia.append(np.array(c_t_in_marginalia)) + id_marginalia.append(nn.attrib['id']) + + + elif tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): + for nn in root1.iter(tag): + c_t_in=[] + c_t_in_text_annotation=[] + c_t_in_decoration=[] + sumi=0 + for vv in nn.iter(): + # check the format of 
coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + else: + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + else: + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + if len(c_t_in_text_annotation)>0: + co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) + if len(c_t_in_decoration)>0: + co_graphic_decoration.append(np.array(c_t_in_decoration)) + if len(c_t_in)>0: + co_graphic.append(np.array(c_t_in)) + + + + elif tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + co_img_text.append(' ') + + + elif tag.endswith('}SeparatorRegion') 
or tag.endswith('}separatorregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + + + elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_table.append(np.array(c_t_in)) + co_table_text.append(' ') + + elif tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_noise.append(np.array(c_t_in)) + co_noise_text.append(' ') + + img = np.zeros( (y_len,x_len,3) ) + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(1,1,1)) + + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(2,2,2)) 
def return_indexes_of_contours_loctaed_inside_another_list_of_contours(self, contours, contours_loc, cx_main_loc, cy_main_loc, indexes_loc):
    """
    Group centre points by the contour that contains them.

    For every contour in ``contours`` each point
    ``(cx_main_loc[k], cy_main_loc[k])`` is tested with
    ``cv2.pointPolygonTest``; points with a result of 0 or 1 are kept.

    Args:
        contours: contours to test against.
        contours_loc: not used by this method.
        cx_main_loc: x coordinates of the points.
        cy_main_loc: y coordinates of the points; its length decides how
            many points are tested.
        indexes_loc: array indexed with the positions of the kept points
            (must support NumPy-style index arrays).

    Returns:
        Three lists with one entry per contour: the selected entries of
        ``indexes_loc``, and the x and y coordinates of the kept points.
    """
    # loop-invariant conversions, done once instead of once per contour
    centers_x = np.array(cx_main_loc)
    centers_y = np.array(cy_main_loc)

    located_indexes = []
    located_cx = []
    located_cy = []

    for outer in contours:
        hits = np.array([
            cv2.pointPolygonTest(outer, (cx_main_loc[pos], cy_main_loc[pos]), False)
            for pos in range(len(cy_main_loc))
        ])
        selected = np.where((hits == 0) | (hits == 1))

        located_indexes.append(indexes_loc[selected])
        located_cx.append(centers_x[selected])
        located_cy.append(centers_y[selected])

    return located_indexes, located_cx, located_cy
np.ones((1, 5), dtype=np.uint8) + + + min_cont_size_to_be_dilated = 10 + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + cx_conts, cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) + args_cont_located = np.array(range(len(contours_only_text_parent))) + + diff_y_conts = np.abs(y_max_conts[:]-y_min_conts) + diff_x_conts = np.abs(x_max_conts[:]-x_min_conts) + + mean_x = statistics.mean(diff_x_conts) + median_x = statistics.median(diff_x_conts) + + + diff_x_ratio= diff_x_conts/mean_x + + args_cont_located_excluded = args_cont_located[diff_x_ratio>=1.3] + args_cont_located_included = args_cont_located[diff_x_ratio<1.3] + + contours_only_text_parent_excluded = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]>=1.3]#contours_only_text_parent[diff_x_ratio>=1.3] + contours_only_text_parent_included = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]<1.3]#contours_only_text_parent[diff_x_ratio<1.3] + + + cx_conts_excluded = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]>=1.3]#cx_conts[diff_x_ratio>=1.3] + cx_conts_included = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]<1.3]#cx_conts[diff_x_ratio<1.3] + + cy_conts_excluded = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]>=1.3]#cy_conts[diff_x_ratio>=1.3] + cy_conts_included = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]<1.3]#cy_conts[diff_x_ratio<1.3] + + #print(diff_x_ratio, 'ratio') + text_regions_p = text_regions_p.astype('uint8') + + if len(contours_only_text_parent_excluded)>0: + textregion_par = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1])).astype('uint8') + textregion_par = cv2.fillPoly(textregion_par, pts=contours_only_text_parent_included, color=(1,1)) + else: + textregion_par = 
(text_regions_p[:,:]==1)*1 + textregion_par = textregion_par.astype('uint8') + + text_regions_p_textregions_dilated = cv2.erode(textregion_par , hor_kernel, iterations=2) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=4) + text_regions_p_textregions_dilated = cv2.erode(text_regions_p_textregions_dilated , hor_kernel, iterations=1) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=5) + text_regions_p_textregions_dilated[text_regions_p[:,:]>1] = 0 + + + contours_only_dilated, hir_on_text_dilated = return_contours_of_image(text_regions_p_textregions_dilated) + contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) + + indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located = self.return_indexes_of_contours_loctaed_inside_another_list_of_contours(contours_only_dilated, contours_only_text_parent_included, cx_conts_included, cy_conts_included, args_cont_located_included) + + + if len(args_cont_located_excluded)>0: + for ind in args_cont_located_excluded: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + #print(len( np.unique(flattened_array)), 'indexes_of_located_cont uniques') + + missing_textregions = list( set(np.array(range(len(contours_only_text_parent))) ) - set(np.unique(flattened_array)) ) + #print(missing_textregions, 'missing_textregions') + + for ind in missing_textregions: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + + if contours_only_text_parent_h: + for vi in 
range(len(contours_only_text_parent_h)): + indexes_of_located_cont.append(int(vi+len(contours_only_text_parent))) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + + y_len = text_regions_p.shape[0] + x_len = text_regions_p.shape[1] + + img_poly = np.zeros((y_len,x_len), dtype='uint8') + ###img_poly[text_regions_p[:,:]==1] = 1 + ###img_poly[text_regions_p[:,:]==2] = 2 + ###img_poly[text_regions_p[:,:]==3] = 4 + ###img_poly[text_regions_p[:,:]==6] = 5 + + ##img_poly[text_regions_p[:,:]==1] = 1 + ##img_poly[text_regions_p[:,:]==2] = 2 + ##img_poly[text_regions_p[:,:]==3] = 3 + ##img_poly[text_regions_p[:,:]==4] = 4 + ##img_poly[text_regions_p[:,:]==5] = 5 + + img_poly = np.copy(text_regions_p) + + img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') + if contours_only_text_parent_h: + _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours( + contours_only_text_parent_h) + for j in range(len(cy_main)): + img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, + int(x_min_main[j]):int(x_max_main[j])] = 1 + co_text_all_org = contours_only_text_parent + contours_only_text_parent_h + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + contours_only_text_parent_h + else: + co_text_all = contours_only_text_parent + contours_only_text_parent_h + else: + co_text_all_org = contours_only_text_parent + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + else: + co_text_all = contours_only_text_parent + + if not len(co_text_all): + return [], [] + + labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) + + co_text_all = [(i/6).astype(int) for i in co_text_all] + for i in range(len(co_text_all)): + img = 
labels_con[:,:,i].astype(np.uint8) + + #img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST) + + cv2.fillPoly(img, pts=[co_text_all[i]], color=(1,)) + labels_con[:,:,i] = img + + + labels_con = resize_image(labels_con.astype(np.uint8), height1, width1).astype(bool) + img_header_and_sep = resize_image(img_header_and_sep, height1, width1) + img_poly = resize_image(img_poly, height3, width3) + + + + input_1 = np.zeros((inference_bs, height1, width1, 3)) + ordered = [list(range(len(co_text_all)))] + index_update = 0 + #print(labels_con.shape[2],"number of regions for reading order") + while index_update>=0: + ij_list = ordered.pop(index_update) + i = ij_list.pop(0) + + ante_list = [] + post_list = [] + tot_counter = 0 + batch = [] + for j in ij_list: + img1 = labels_con[:,:,i].astype(float) + img2 = labels_con[:,:,j].astype(float) + img1[img_poly==5] = 2 + img2[img_poly==5] = 2 + img1[img_header_and_sep==1] = 3 + img2[img_header_and_sep==1] = 3 + + input_1[len(batch), :, :, 0] = img1 / 3. + input_1[len(batch), :, :, 2] = img2 / 3. + input_1[len(batch), :, :, 1] = img_poly / 5. 
def run(self, xml_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
    """
    Compute the reading order for one PAGE-XML file or a directory of them.

    For every input file the regions are read with ``self.read_xml``, ordered
    with ``self.do_order_of_regions_with_model`` and the file is written to
    ``self.dir_out`` with a freshly built ``ReadingOrder`` element (an
    existing one is removed first).

    Args:
        xml_filename: path of a single XML file; used when ``dir_in`` is not given.
        dir_in: directory whose ``*.xml`` entries are processed; takes
            precedence over ``xml_filename``.
        overwrite: accepted for interface compatibility; this method does not
            consult it and always writes the output file.

    Raises:
        ValueError: if neither ``xml_filename`` nor ``dir_in`` is given.
    """
    self.logger.debug("enter run")

    if dir_in:
        # Skip anything in the directory that is not an XML file instead of
        # failing on it while parsing.
        self.ls_xmls = [name for name in os.listdir(dir_in)
                        if name.lower().endswith('.xml')]
    elif xml_filename:
        self.ls_xmls = [xml_filename]
    else:
        raise ValueError("run requires either a single xml filename or a directory")

    for xml_name in self.ls_xmls:
        self.logger.info(xml_name)

        xml_file = os.path.join(dir_in, xml_name) if dir_in else xml_name

        (tree_xml, root_xml, _, file_name, id_paragraph, id_header,
         co_text_paragraph, co_text_header, _, _, _, _, img_poly) = self.read_xml(xml_file)

        order_text_new, _ = self.do_order_of_regions_with_model(
            co_text_paragraph, co_text_header, img_poly[:,:,0])

        # Region ids rearranged into the predicted reading order.
        id_all_text = np.array(id_paragraph + id_header)[order_text_new]

        alltags = [elem.tag for elem in root_xml.iter()]

        # The root tag looks like '{namespace}Name'; keep both the
        # '{namespace}' prefix and the bare namespace URI.
        link = alltags[0].split('}')[0] + '}'
        name_space = alltags[0].split('}')[0].split('{')[1]

        page_element = root_xml.find(link + 'Page')

        old_ro = root_xml.find(".//{*}ReadingOrder")
        if old_ro is not None:
            page_element.remove(old_ro)

        ro_subelement = ET.Element('ReadingOrder')
        ro_subelement2 = ET.SubElement(ro_subelement, 'OrderedGroup')
        ro_subelement2.set('id', "ro357564684568544579089")

        for index, id_text in enumerate(id_all_text):
            new_element_2 = ET.SubElement(ro_subelement2, 'RegionRefIndexed')
            # id_text is a numpy string here; hand ElementTree a plain str.
            new_element_2.set('regionRef', str(id_text))
            new_element_2.set('index', str(index))

        # Insert after the first child when the page carries a PrintSpace or
        # Border element, otherwise at the very beginning.
        if (link + 'PrintSpace' in alltags) or (link + 'Border' in alltags):
            page_element.insert(1, ro_subelement)
        else:
            page_element.insert(0, ro_subelement)

        ET.register_namespace("", name_space)
        tree_xml.write(os.path.join(self.dir_out, file_name+'.xml'),xml_declaration=True,method='xml',encoding="utf8",default_namespace=None)
c5962f8..ca86047 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -992,7 +992,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( (regions_model_full[:,:,0]==2)).sum() pixels_main = all_pixels - pixels_header - if (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): + if ( (pixels_header/float(pixels_main)>=0.6) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ) and ( (length_con[ii]/float(height_con[ii]) )<=3 )) or ( (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=3 ) ): regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2 contours_only_text_parent_head.append(con) if contours_only_text_parent_d_ordered is not None: @@ -1801,8 +1801,8 @@ def return_boxes_of_images_by_order_of_reading_new( #print(y_type_2_up,x_starting_up,x_ending_up,'didid') nodes_in = [] for ij in range(len(x_starting_up)): - nodes_in = nodes_in + list(range(x_starting_up[ij], - x_ending_up[ij])) + nodes_in = nodes_in + list(range(int(x_starting_up[ij]), + int(x_ending_up[ij]))) nodes_in = np.unique(nodes_in) #print(nodes_in,'nodes_in') @@ -1825,8 +1825,8 @@ def return_boxes_of_images_by_order_of_reading_new( elif len(y_diff_main_separator_up)==0: nodes_in = [] for ij in range(len(x_starting_up)): - nodes_in = nodes_in + list(range(x_starting_up[ij], - x_ending_up[ij])) + nodes_in = nodes_in + list(range(int(x_starting_up[ij]), + int(x_ending_up[ij]))) nodes_in = np.unique(nodes_in) #print(nodes_in,'nodes_in2') #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') @@ -1866,8 +1866,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_start_without_mother)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_start_without_mother[dj], - x_end_without_mother[dj])) + list(range(int(x_start_without_mother[dj]), + 
int(x_end_without_mother[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(len(peaks_neg_tot)-1) @@ -1909,8 +1909,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_start_without_mother)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_start_without_mother[dj], - x_end_without_mother[dj])) + list(range(int(x_start_without_mother[dj]), + int(x_end_without_mother[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(len(peaks_neg_tot)-1) @@ -1926,8 +1926,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_with_child_no_mothers = [] for dj in range(len(x_end_with_child_without_mother)): columns_covered_by_with_child_no_mothers = columns_covered_by_with_child_no_mothers + \ - list(range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) + list(range(int(x_start_with_child_without_mother[dj]), + int(x_end_with_child_without_mother[dj]))) columns_covered_by_with_child_no_mothers = list(set(columns_covered_by_with_child_no_mothers)) all_columns = np.arange(len(peaks_neg_tot)-1) @@ -1970,8 +1970,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_starting_all_between_nm_wc)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) + list(range(int(x_starting_all_between_nm_wc[dj]), + int(x_ending_all_between_nm_wc[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(i_s_nc, x_end_biggest_column) @@ -1979,8 +1979,8 @@ def return_boxes_of_images_by_order_of_reading_new( should_longest_line_be_extended=0 if (len(x_diff_all_between_nm_wc) > 0 and - set(list(range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])) + + 
set(list(range(int(x_starting_all_between_nm_wc[biggest]), + int(x_ending_all_between_nm_wc[biggest]))) + list(columns_not_covered)) != set(all_columns)): should_longest_line_be_extended=1 index_lines_so_close_to_top_separator = \ @@ -2012,7 +2012,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered) + 1) ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(i_s_nc, x_end_biggest_column): + for column in range(int(i_s_nc), int(x_end_biggest_column)): ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') @@ -2064,7 +2064,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_itself=x_end_copy.pop(il) #print(y_copy,'y_copy2') - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') y_in_cols=[] for yic in range(len(y_copy)): @@ -2095,11 +2095,11 @@ def return_boxes_of_images_by_order_of_reading_new( all_columns = np.arange(len(peaks_neg_tot)-1) columns_covered_by_lines_covered_more_than_2col = [] for dj in range(len(x_starting)): - if set(list(range(x_starting[dj],x_ending[dj]))) == set(all_columns): + if set(list(range(int(x_starting[dj]),int(x_ending[dj]) ))) == set(all_columns): pass else: columns_covered_by_lines_covered_more_than_2col = columns_covered_by_lines_covered_more_than_2col + \ - list(range(x_starting[dj],x_ending[dj])) + list(range(int(x_starting[dj]),int(x_ending[dj]) )) columns_covered_by_lines_covered_more_than_2col = list(set(columns_covered_by_lines_covered_more_than_2col)) columns_not_covered = list(set(all_columns) - set(columns_covered_by_lines_covered_more_than_2col)) @@ -2124,7 +2124,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) ind_args=np.array(range(len(y_type_2))) - 
#ind_args=np.array(ind_args) + for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] @@ -2155,8 +2155,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_itself=x_start_copy.pop(il) x_end_itself=x_end_copy.pop(il) - #print(y_copy,'y_copy2') - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') y_in_cols=[] for yic in range(len(y_copy)): diff --git a/src/eynollah/utils/marginals.py b/src/eynollah/utils/marginals.py index a29e50d..ac8dc1d 100644 --- a/src/eynollah/utils/marginals.py +++ b/src/eynollah/utils/marginals.py @@ -10,7 +10,6 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve mask_marginals=np.zeros((text_with_lines.shape[0],text_with_lines.shape[1])) mask_marginals=mask_marginals.astype(np.uint8) - text_with_lines=text_with_lines.astype(np.uint8) ##text_with_lines=cv2.erode(text_with_lines,self.kernel,iterations=3) @@ -26,8 +25,12 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve text_with_lines=resize_image(text_with_lines,int(text_with_lines.shape[0]*1.8),text_with_lines.shape[1]) text_with_lines=cv2.erode(text_with_lines,kernel,iterations=7) text_with_lines=resize_image(text_with_lines,text_with_lines_eroded.shape[0],text_with_lines_eroded.shape[1]) - - + + + if light_version: + kernel_hor = np.ones((1, 5), dtype=np.uint8) + text_with_lines = cv2.erode(text_with_lines,kernel_hor,iterations=6) + text_with_lines_y=text_with_lines.sum(axis=0) text_with_lines_y_eroded=text_with_lines_eroded.sum(axis=0) @@ -40,8 +43,10 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve elif thickness_along_y_percent>=30 and thickness_along_y_percent<50: min_textline_thickness=20 else: - min_textline_thickness=40 - + if light_version: + min_textline_thickness=45 + else: + min_textline_thickness=40 if 
def return_deskew_slop_old_mp(img_patch_org, sigma_des,n_tot_angles=100,
                              main_page=False, logger=None, plotter=None):
    """
    Estimate the skew angle of a patch using the process-based angle search.

    The patch is copied into the centre of a square, zero-filled canvas whose
    side is 1.1 times the larger patch dimension, then candidate angles are
    scored by ``get_smallest_skew_omp`` in one or two passes:

    * main page, wider than tall: the four angles -45/0/45/90, then
      ``n_tot_angles`` angles within +-22.5 of the winner;
    * main page otherwise: -12..12, and -90..-12 or 90..12 when the first
      result exceeds 11 in magnitude;
    * not a main page: -25..25, and -90..-25 or 90..25 when the first result
      exceeds 22 in magnitude (both with ``int(0.5 * n_tot_angles) + 10``
      samples).

    ``logger`` is accepted but not used. Returns the selected angle.
    """
    if main_page and plotter:
        plotter.save_plot_of_textline_density(img_patch_org)

    rows, cols = img_patch_org.shape[0], img_patch_org.shape[1]
    patch = np.zeros((rows, cols))
    patch[:, :] = img_patch_org[:, :]

    side = int(np.max(patch.shape) * (1.1))
    canvas = np.zeros((side, side))
    left = int((canvas.shape[1] - patch.shape[1]) / 2.)
    top = int((canvas.shape[0] - patch.shape[0]) / 2.)
    canvas[top:top + patch.shape[0], left:left + patch.shape[1]] = patch[:, :]

    def best_of(candidates):
        # Score one set of candidate angles on the padded canvas.
        return get_smallest_skew_omp(canvas, sigma_des, candidates, plotter=plotter)

    if main_page and cols > rows:
        coarse = best_of(np.array([-45, 0, 45, 90,]))
        return best_of(np.linspace(coarse - 22.5, coarse + 22.5, n_tot_angles))

    if main_page:
        bound, early_slope_edge, samples = 12, 11, n_tot_angles
    else:
        bound, early_slope_edge, samples = 25, 22, int(0.5 * n_tot_angles) + 10

    angle = best_of(np.linspace(-bound, bound, samples))
    if abs(angle) > early_slope_edge:
        # Second pass over the steep range on the side the first pass chose.
        direction = -1 if angle < 0 else 1
        angle = best_of(np.linspace(direction * 90, direction * bound, samples))
    return angle
def get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=None):
    """
    Score candidate angles in worker processes and return the best one.

    ``angles`` is split into contiguous chunks, one per worker; every worker
    runs ``do_image_rotation_omp`` and puts ``[variances, angles]`` on a
    shared queue. The angle with the largest variance wins.

    Args:
        img_resized: image handed unchanged to the workers.
        sigma_des: smoothing parameter handed unchanged to the workers.
        angles: sequence of candidate angles.
        plotter: optional object; when truthy its
            ``save_plot_of_rotation_angle(angles, variances)`` is called.

    Returns:
        The selected angle, or 0 when there is nothing to choose from.
    """
    # Never start more workers than there are angles to hand out.
    num_workers = min(cpu_count(), len(angles))
    if num_workers == 0:
        if plotter:
            plotter.save_plot_of_rotation_angle([], [])
        return 0

    queue_of_all_params = Queue()
    bounds = np.linspace(0, len(angles), num_workers + 1)
    processes = [
        Process(target=do_image_rotation_omp,
                args=(queue_of_all_params, angles[int(start):int(stop)], img_resized, sigma_des))
        for start, stop in zip(bounds[:-1], bounds[1:])
    ]
    for process in processes:
        process.start()

    var_res = []
    all_angles = []
    # Drain the queue before joining: a process that has put items on a queue
    # waits at exit until the buffered items have been consumed.
    for _ in processes:
        vars_for_subprocess, angles_sub_process = queue_of_all_params.get(True)
        var_res.extend(vars_for_subprocess)
        all_angles.extend(angles_sub_process)

    for process in processes:
        process.join()

    if plotter:
        plotter.save_plot_of_rotation_angle(all_angles, var_res)

    try:
        return all_angles[np.argmax(np.array(var_res))]
    except (ValueError, TypeError, IndexError):
        # No usable scores (empty or malformed results): fall back to "no skew".
        return 0
def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image):
    """
    Find a column near the middle of a text line at which it can be split.

    The first channel is summed per column, smoothed with a Gaussian
    (sigma 3) and its peaks are located. Only when more than 70 peaks exist
    is a split attempted; the split column is then the peak with the largest
    smoothed value among those strictly inside the central window of
    +-6 % of the width around the horizontal centre.

    Args:
        textline_image: array indexed as ``[row, column, channel]``.

    Returns:
        The selected column index, or ``None`` when the line has at most
        70 peaks or no peak falls inside the central window.
    """
    width = np.shape(textline_image)[1]
    common_window = int(0.06*width)

    width1 = int(width/2. - common_window)
    width2 = int(width/2. + common_window)

    img_sum = np.sum(textline_image[:,:,0], axis=0)
    sum_smoothed = gaussian_filter1d(img_sum, 3)

    peaks_real, _ = find_peaks(sum_smoothed, height=0)
    if len(peaks_real) <= 70:
        return None

    # Keep only the peaks strictly inside the central window.
    peaks_real = peaks_real[(peaks_real < width2) & (peaks_real > width1)]
    if len(peaks_real) == 0:
        # np.argmax would raise on an empty selection; report "no split".
        return None

    return peaks_real[np.argmax(sum_smoothed[peaks_real])]
def get_deskewed_contour_and_bb_and_image(contour, image, deskew_angle):
    """
    Rotate ``image`` by ``deskew_angle`` and crop it to the rotated contour.

    The rotation is taken about the image centre on a canvas enlarged so the
    whole rotated image fits. The same affine matrix is applied to the points
    of ``contour``, and the axis-aligned bounding rectangle of the transformed
    points is cut out of the rotated image.

    Returns:
        The cropped part of the rotated image.
    """
    src_h, src_w = image.shape[:2]
    pivot = (src_w // 2, src_h // 2)

    affine = cv2.getRotationMatrix2D(pivot, deskew_angle, 1.0)

    # Size of the canvas that holds the whole rotated image.
    abs_cos = abs(affine[0, 0])
    abs_sin = abs(affine[0, 1])
    dst_w = int((src_h * abs_sin) + (src_w * abs_cos))
    dst_h = int((src_h * abs_cos) + (src_w * abs_sin))

    # Shift the rotation so its result is centred on the enlarged canvas.
    affine[0, 2] += (dst_w / 2) - pivot[0]
    affine[1, 2] += (dst_h / 2) - pivot[1]

    rotated = cv2.warpAffine(image, affine, (dst_w, dst_h))

    points = np.array(contour, dtype=np.float32)
    moved_points = cv2.transform(np.array([points]), affine)[0]

    left, top, box_w, box_h = cv2.boundingRect(np.array(moved_points, dtype=np.int32))
    return rotated[top:top + box_h, left:left + box_w]
def return_splitting_point_of_image(image_to_spliited):
    """
    Return up to three columns at which an image can be cut into pieces.

    The first channel is summed per column and smoothed with a Gaussian
    (sigma 1). Peaks of that profile lying strictly more than 3 % of the
    width away from both the left and the right border are candidates; the
    three with the largest smoothed value are kept.

    Args:
        image_to_spliited: array indexed as ``[row, column, channel]``.

    Returns:
        A sorted array with at most three column indices (empty when no
        candidate peak exists).
    """
    width = np.shape(image_to_spliited)[1]
    common_window = int(0.03*width)

    width1 = int(common_window)
    width2 = int(width - common_window)

    img_sum = np.sum(image_to_spliited[:,:,0], axis=0)
    sum_smoothed = gaussian_filter1d(img_sum, 1)

    peaks_real, _ = find_peaks(sum_smoothed, height=0)
    # Discard the peaks inside the border margins.
    peaks_real = peaks_real[(peaks_real < width2) & (peaks_real > width1)]

    # Strongest three peaks, returned in left-to-right order.
    arg_sort = np.argsort(sum_smoothed[peaks_real])
    peaks_sort_4 = peaks_real[arg_sort][::-1][:3]

    return np.sort(peaks_sort_4)
+ img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved is not None: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) + else: + mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + if img_bin_curved is not None: + img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved is not None: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) + + + else: + img_in_des = np.copy(img_in) + if img_bin_curved is not None: + img_bin_in_des = np.copy(img_bin_in) + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved is not None: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) + + w_tot_des+=img_in_des.shape[1] + w_tot_des_list.append(img_in_des.shape[1]) + imgs_deskewed_list.append(img_in_des) + if img_bin_curved is not None: + imgs_bin_deskewed_list.append(img_bin_in_des) + + + + + img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 + if img_bin_curved is not None: + img_bin_final_deskewed = np.zeros((32, w_tot_des, 3))+255 + else: + img_bin_final_deskewed = None + + w_indexer = 0 + for ind in range(len(w_tot_des_list)): + img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] + if img_bin_curved is not None: + img_bin_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_bin_deskewed_list[ind][:,:,:] + w_indexer = w_indexer+w_tot_des_list[ind] + return img_final_deskewed, img_bin_final_deskewed + else: + return img_curved, img_bin_curved + +def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind): + textline_contour[:,0] = 
textline_contour[:,0] + box_ind[2] + textline_contour[:,1] = textline_contour[:,1] + box_ind[0] + return textline_contour + + +def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, prediction_model, b_s_ocr, num_to_char, textline_light=False, curved_line=False): + max_len = 512 + padding_token = 299 + image_width = 512#max_len * 4 + image_height = 32 + ind_tot = 0 + #cv2.imwrite('./img_out.png', image_page) + ocr_all_textlines = [] + cropped_lines_region_indexer = [] + cropped_lines_meging_indexing = [] + cropped_lines = [] + indexer_text_region = 0 + + for indexing, ind_poly_first in enumerate(all_found_textline_polygons): + #ocr_textline_in_textregion = [] + if len(ind_poly_first)==0: + cropped_lines_region_indexer.append(indexer_text_region) + cropped_lines_meging_indexing.append(0) + img_fin = np.ones((image_height, image_width, 3))*1 + cropped_lines.append(img_fin) + + else: + for indexing2, ind_poly in enumerate(ind_poly_first): + cropped_lines_region_indexer.append(indexer_text_region) + if not (textline_light or curved_line): + ind_poly = copy.deepcopy(ind_poly) + box_ind = all_box_coord[indexing] + + ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) + #print(ind_poly_copy) + ind_poly[ind_poly<0] = 0 + x, y, w, h = cv2.boundingRect(ind_poly) + + w_scaled = w * image_height/float(h) + + mask_poly = np.zeros(image.shape) + + img_poly_on_img = np.copy(image) + + mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) + + + + mask_poly = mask_poly[y:y+h, x:x+w, :] + img_crop = img_poly_on_img[y:y+h, x:x+w, :] + + img_crop[mask_poly==0] = 255 + + if w_scaled < 640:#1.5*image_width: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(0) + else: + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) + + if splited_images: + img_fin = 
preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(1) + + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(-1) + + else: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(0) + + indexer_text_region+=1 + + extracted_texts = [] + + n_iterations = math.ceil(len(cropped_lines) / b_s_ocr) + + for i in range(n_iterations): + if i==(n_iterations-1): + n_start = i*b_s_ocr + imgs = cropped_lines[n_start:] + imgs = np.array(imgs) + imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3) + + + else: + n_start = i*b_s_ocr + n_end = (i+1)*b_s_ocr + imgs = cropped_lines[n_start:n_end] + imgs = np.array(imgs).reshape(b_s_ocr, image_height, image_width, 3) + + + preds = prediction_model.predict(imgs, verbose=0) + + pred_texts = decode_batch_predictions(preds, num_to_char) + + for ib in range(imgs.shape[0]): + pred_texts_ib = pred_texts[ib].replace("[UNK]", "") + extracted_texts.append(pred_texts_ib) + + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + + extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] + unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) + + ocr_all_textlines = [] + for ind in unique_cropped_lines_region_indexer: + ocr_textline_in_textregion = [] + extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] + for it_ind, text_textline in enumerate(extracted_texts_merged_un): + 
ocr_textline_in_textregion.append(text_textline) + ocr_all_textlines.append(ocr_textline_in_textregion) + return ocr_all_textlines + +def biopython_align(str1, str2): + alignments = pairwise2.align.globalms(str1, str2, 2, -1, -2, -2) + best_alignment = alignments[0] # Get the best alignment + return best_alignment.seqA, best_alignment.seqB diff --git a/src/eynollah/utils/xml.py b/src/eynollah/utils/xml.py index bd95702..13420df 100644 --- a/src/eynollah/utils/xml.py +++ b/src/eynollah/utils/xml.py @@ -46,16 +46,22 @@ def create_page_xml(imageFilename, height, width): )) return pcgts -def xml_reading_order(page, order_of_texts, id_of_marginalia): +def xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right): region_order = ReadingOrderType() og = OrderedGroupType(id="ro357564684568544579089") page.set_ReadingOrder(region_order) region_order.set_OrderedGroup(og) region_counter = EynollahIdCounter() + + for id_marginal in id_of_marginalia_left: + og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) + region_counter.inc('region') + for idx_textregion, _ in enumerate(order_of_texts): og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=region_counter.region_id(order_of_texts[idx_textregion] + 1))) region_counter.inc('region') - for id_marginal in id_of_marginalia: + + for id_marginal in id_of_marginalia_right: og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) region_counter.inc('region') diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 92e353f..2f9caf3 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -56,10 +56,12 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, 
all_box_coord_marginals, slopes_marginals, counter): + def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_all_textlines_textregion): for j in range(len(all_found_textline_polygons_marginals[marginal_idx])): coords = CoordsType() textline = TextLineType(id=counter.next_line_id, Coords=coords) + if ocr_all_textlines_textregion: + textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) marginal_region.add_TextLine(textline) marginal_region.set_orientation(-slopes_marginals[marginal_idx]) points_co = '' @@ -119,7 +121,7 @@ class EynollahXmlWriter(): points_co += ',' points_co += str(textline_y_coord) - if (self.curved_line or self.textline_light) and np.abs(slopes[region_idx]) <= 45: + if self.textline_light or (self.curved_line and np.abs(slopes[region_idx]) <= 45): if len(contour_textline) == 2: points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) points_co += ',' @@ -128,7 +130,7 @@ class EynollahXmlWriter(): points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) - elif (self.curved_line or self.textline_light) and np.abs(slopes[region_idx]) > 45: + elif self.curved_line and np.abs(slopes[region_idx]) > 45: if len(contour_textline)==2: points_co += str(int((contour_textline[0] + region_bboxes[2] + page_coord[2])/self.scale_x)) points_co += ',' @@ -168,7 +170,7 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, 
polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -179,12 +181,13 @@ class EynollahXmlWriter(): counter = EynollahIdCounter() if len(found_polygons_text_region) > 0: _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] - xml_reading_order(page, order_of_texts, id_of_marginalia) + id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] + xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord), conf=conf_contours_textregion[mm]), + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord, skip_layout_reading_order), conf=conf_contours_textregion[mm]), ) #textregion.set_conf(conf_contours_textregion[mm]) 
page.add_TextRegion(textregion) @@ -193,12 +196,29 @@ class EynollahXmlWriter(): else: ocr_textlines = None self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter, ocr_textlines) - - for mm in range(len(found_polygons_marginals)): + + for mm in range(len(found_polygons_marginals_left)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) page.add_TextRegion(marginal) - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) + if ocr_all_textlines_marginals_left: + ocr_textlines = ocr_all_textlines_marginals_left[mm] + else: + ocr_textlines = None + + #print(ocr_textlines, mm, len(all_found_textline_polygons_marginals_left[mm]) ) + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + + for mm in range(len(found_polygons_marginals_right)): + marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + page.add_TextRegion(marginal) + if ocr_all_textlines_marginals_right: + ocr_textlines = ocr_all_textlines_marginals_right[mm] + else: + ocr_textlines = None + + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) for mm in range(len(found_polygons_text_region_img)): img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) @@ -242,7 +262,7 @@ class EynollahXmlWriter(): return pcgts - def 
build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines, conf_contours_textregion, conf_contours_textregion_h): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -252,8 +272,9 @@ class EynollahXmlWriter(): counter = EynollahIdCounter() _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] - xml_reading_order(page, order_of_texts, id_of_marginalia) + id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id for _ in 
found_polygons_marginals_right] + xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', @@ -272,25 +293,43 @@ class EynollahXmlWriter(): Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) page.add_TextRegion(textregion) - if ocr_all_textlines: - ocr_textlines = ocr_all_textlines[mm] + if ocr_all_textlines_h: + ocr_textlines = ocr_all_textlines_h[mm] else: ocr_textlines = None self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals)): + for mm in range(len(found_polygons_marginals_left)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) page.add_TextRegion(marginal) - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) - + if ocr_all_textlines_marginals_left: + ocr_textlines = ocr_all_textlines_marginals_left[mm] + else: + ocr_textlines = None + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + + for mm in range(len(found_polygons_marginals_right)): + marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + page.add_TextRegion(marginal) + if ocr_all_textlines_marginals_right: + ocr_textlines = ocr_all_textlines_marginals_right[mm] + else: + ocr_textlines = 
None + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) + for mm in range(len(found_polygons_drop_capitals)): dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))) page.add_TextRegion(dropcapital) - ###all_box_coord_drop = None - ###slopes_drop = None - ###self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=None) + all_box_coord_drop = None + slopes_drop = None + if ocr_all_textlines_drop: + ocr_textlines = ocr_all_textlines_drop[mm] + else: + ocr_textlines = None + self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=ocr_textlines) for mm in range(len(found_polygons_text_region_img)): page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)))) @@ -303,18 +342,28 @@ class EynollahXmlWriter(): return pcgts - def calculate_polygon_coords(self, contour, page_coord): + def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False): self.logger.debug('enter calculate_polygon_coords') coords = '' for value_bbox in contour: - if len(value_bbox) == 2: - coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) + if skip_layout_reading_order: + if len(value_bbox) == 2: + coords += str(int((value_bbox[0]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[1]) / self.scale_y)) + else: + coords += str(int((value_bbox[0][0]) / self.scale_x)) + 
coords += ',' + coords += str(int((value_bbox[0][1]) / self.scale_y)) else: - coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) + if len(value_bbox) == 2: + coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) + else: + coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) coords=coords + ' ' return coords[:-1]