diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 8ce50bd..57384b6 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -13,11 +13,10 @@ import time import warnings from pathlib import Path from multiprocessing import Process, Queue, cpu_count - +import gc from ocrd_utils import getLogger import cv2 import numpy as np - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" stderr = sys.stderr sys.stderr = open(os.devnull, "w") @@ -63,11 +62,11 @@ from .utils import ( putt_bb_of_drop_capitals_of_model_in_patches_in_layout, check_any_text_region_in_model_one_is_main_or_header, small_textlines_to_parent_adherence2, - order_and_id_of_texts, order_of_regions, find_number_of_columns_in_document, return_boxes_of_images_by_order_of_reading_new) from .utils.pil_cv2 import check_dpi +from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter from .writer import EynollahXmlWriter @@ -149,7 +148,7 @@ class Eynollah: def predict_enhancement(self, img): self.logger.debug("enter predict_enhancement") - model_enhancement, _ = self.start_new_session_and_model(self.model_dir_of_enhancement) + model_enhancement, session_enhancement = self.start_new_session_and_model(self.model_dir_of_enhancement) img_height_model = model_enhancement.layers[len(model_enhancement.layers) - 1].output_shape[1] img_width_model = model_enhancement.layers[len(model_enhancement.layers) - 1].output_shape[2] @@ -230,6 +229,10 @@ class Eynollah: prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg prediction_true = prediction_true.astype(int) + session_enhancement.close() + del model_enhancement + del session_enhancement + gc.collect() return prediction_true @@ -324,8 +327,14 @@ class Eynollah: self.logger.info("Found %s columns (%s)", num_col, label_p_pred) session_col_classifier.close() - + + del model_num_classifier + del session_col_classifier + K.clear_session() + gc.collect() + + img_new, _ = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) @@ -375,7 +384,10 @@ class Eynollah: is_image_enhanced = False num_column_is_classified = True image_res = np.copy(img) + + session_col_classifier.close() + self.logger.debug("exit resize_and_enhance_image_with_column_classifier") return is_image_enhanced, img, image_res, num_col, num_column_is_classified @@ -438,13 +450,17 @@ class Eynollah: model = load_model(model_dir, compile=False) return model, session + + def start_new_session_and_model(self, model_dir): self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) - gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True) + gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) + #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True) session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) model = load_model(model_dir, compile=False) return model, session + def do_prediction(self, patches, img, model, marginal_of_patch_percent=0.1): self.logger.debug("enter do_prediction") @@ -560,6 +576,8 @@ class Eynollah: prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg_color prediction_true = prediction_true.astype(np.uint8) + del model + gc.collect() return prediction_true def early_page_for_num_of_column_classification(self): @@ -580,7 +598,10 @@ class Eynollah: box = [x, y, w, h] croped_page, page_coord = crop_image_inside_box(box, img) session_page.close() - + del model_page + del session_page + gc.collect() + K.clear_session() self.logger.debug("exit early_page_for_num_of_column_classification") return croped_page, page_coord @@ -612,7 +633,9 @@ class Eynollah: croped_page, page_coord = crop_image_inside_box(box, self.image) cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) session_page.close() - + del model_page + del session_page + gc.collect() K.clear_session() self.logger.debug("exit extract_page") return croped_page, page_coord, cont_page @@ -710,6 +733,10 @@ class Eynollah: prediction_regions = resize_image(prediction_regions, img_height_h, img_width_h) session_region.close() + del model_region + del session_region + gc.collect() + self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions2 @@ -1006,11 +1033,10 @@ class Eynollah: prediction_textline = resize_image(prediction_textline, img_h, img_w) prediction_textline_longshot = self.do_prediction(False, img, model_textline) prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) - ##plt.imshow(prediction_textline_streched[:,:,0]) - ##plt.show() session_textline.close() + return prediction_textline[:, :, 0], prediction_textline_longshot_true_size[:, :, 0] def do_work_of_slopes(self, q, poly, box_sub, boxes_per_process, textline_mask_tot, contours_per_process): @@ -1077,18 +1103,22 @@ class Eynollah: ##plt.show() prediction_regions_org=prediction_regions_org[:,:,0] prediction_regions_org[(prediction_regions_org[:,:]==1) & (mask_zeros_y[:,:]==1)]=0 + session_region.close() + del model_region + del session_region + gc.collect() model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p2) img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1])) prediction_regions_org2 = self.do_prediction(True, img, model_region, 0.2) prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) - #plt.imshow(prediction_regions_org2[:,:,0]) - #plt.show() - ##prediction_regions_org=prediction_regions_org[:,:,0] session_region.close() + del model_region + del session_region + gc.collect() mask_zeros2 = (prediction_regions_org2[:,:,0] == 0) mask_lines2 = (prediction_regions_org2[:,:,0] == 3) @@ -1179,18 +1209,16 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point + order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + ref_point += len(id_of_texts) order_of_texts_tot = [] for tj1 in range(len(contours_only_text_parent)): @@ -1253,18 +1281,16 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point + order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + ref_point += len(id_of_texts) order_of_texts_tot = [] for tj1 in range(len(contours_only_text_parent)): @@ -1311,13 +1337,12 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point - for jji in range(len(id_of_texts)): + for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + ref_point += len(id_of_texts) order_of_texts_tot = [] for tj1 in range(len(contours_only_text_parent)): @@ -1326,7 +1351,7 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) - + except Exception as why: self.logger.error(why) arg_text_con = [] @@ -1359,13 +1384,12 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + ref_point += len(id_of_texts) order_of_texts_tot = [] for tj1 in range(len(contours_only_text_parent)): @@ -1374,7 +1398,7 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) - + return order_text_new, id_of_texts_tot def do_order_of_regions(self, *args, **kwargs): diff --git a/qurator/eynollah/utils/__init__.py b/qurator/eynollah/utils/__init__.py index 707e32d..8916804 100644 --- a/qurator/eynollah/utils/__init__.py +++ b/qurator/eynollah/utils/__init__.py @@ -977,45 +977,6 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): - indexes_sorted = np.array(indexes_sorted) - index_of_types = np.array(index_of_types) - kind_of_texts = np.array(kind_of_texts) - - id_of_texts = [] - order_of_texts = [] - - index_of_types_1 = index_of_types[kind_of_texts == 1] - indexes_sorted_1 = indexes_sorted[kind_of_texts == 1] - - index_of_types_2 = index_of_types[kind_of_texts == 2] - indexes_sorted_2 = indexes_sorted[kind_of_texts == 2] - - ##print(index_of_types,'index_of_types') - ##print(kind_of_texts,'kind_of_texts') - ##print(len(found_polygons_text_region),'found_polygons_text_region') - ##print(index_of_types_1,'index_of_types_1') - ##print(indexes_sorted_1,'indexes_sorted_1') - index_b = 0 + ref_point - for mm in range(len(found_polygons_text_region)): - - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[mm]] - - if len(interest) > 0: - order_of_texts.append(interest[0]) - index_b += 1 - else: - pass - - for mm in range(len(found_polygons_text_region_h)): - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_2[index_of_types_2[mm]] - order_of_texts.append(interest) - index_b += 1 - - return order_of_texts, id_of_texts - def order_of_regions(textline_mask, contours_main, contours_header, y_ref): ##plt.imshow(textline_mask) diff --git a/qurator/eynollah/utils/counter.py b/qurator/eynollah/utils/counter.py new file mode 100644 index 0000000..bc1d765 --- /dev/null +++ b/qurator/eynollah/utils/counter.py @@ -0,0 +1,48 @@ +from collections import Counter + +REGION_ID_TEMPLATE = 'region_%04d' +LINE_ID_TEMPLATE = 'region_%04d_line_%04d' + +class EynollahIdCounter(): + + def __init__(self, region_idx=0, line_idx=0): + self._counter = Counter() + self._inital_region_idx = region_idx + self._inital_line_idx = line_idx + self.reset() + + def reset(self): + self.set('region', self._inital_region_idx) + self.set('line', self._inital_line_idx) + + def inc(self, name, val=1): + self._counter.update({name: val}) + + def get(self, name): + return self._counter[name] + + def set(self, name, val): + self._counter[name] = val + + def region_id(self, region_idx=None): + if region_idx is None: + region_idx = self._counter['region'] + return REGION_ID_TEMPLATE % region_idx + + def line_id(self, region_idx=None, line_idx=None): + if region_idx is None: + region_idx = self._counter['region'] + if line_idx is None: + line_idx = self._counter['line'] + return LINE_ID_TEMPLATE % (region_idx, line_idx) + + @property + def next_region_id(self): + self.inc('region') + self.set('line', 0) + return self.region_id() + + @property + def next_line_id(self): + self.inc('line') + return self.line_id() diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index fe806e9..e972218 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -1,4 +1,8 @@ +# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member +# pylint: disable=invalid-name from lxml import etree as ET +from .counter import EynollahIdCounter +import numpy as np NAMESPACES = {} NAMESPACES['page'] = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" @@ -37,26 +41,46 @@ def add_textequiv(parent, text=''): unireg = ET.SubElement(textequiv, 'Unicode') unireg.text = text -def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals): - """ - XXX side-effect: extends id_of_marginalia - """ +def xml_reading_order(page, order_of_texts, id_of_marginalia): region_order = ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') region_order_sub.set('id', "ro357564684568544579089") - indexer_region = 0 - for vj in order_of_texts: - name = "coord_text_%s" % vj + region_counter = EynollahIdCounter() + for idx_textregion, _ in enumerate(order_of_texts): name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index', str(indexer_region)) - name.set('regionRef', id_of_texts[vj]) - indexer_region += 1 - for vm in range(len(found_polygons_marginals)): - id_of_marginalia.append('r%s' % indexer_region) - name = "coord_text_%s" % indexer_region + name.set('index', str(region_counter.get('region'))) + name.set('regionRef', region_counter.region_id(order_of_texts[idx_textregion] + 1)) + region_counter.inc('region') + for id_marginal in id_of_marginalia: name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index', str(indexer_region)) - name.set('regionRef', 'r%s' % indexer_region) - indexer_region += 1 - return id_of_marginalia + name.set('index', str(region_counter.get('region'))) + name.set('regionRef', id_marginal) + region_counter.inc('region') +def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): + indexes_sorted = np.array(indexes_sorted) + index_of_types = np.array(index_of_types) + kind_of_texts = np.array(kind_of_texts) + + id_of_texts = [] + order_of_texts = [] + + index_of_types_1 = index_of_types[kind_of_texts == 1] + indexes_sorted_1 = indexes_sorted[kind_of_texts == 1] + + index_of_types_2 = index_of_types[kind_of_texts == 2] + indexes_sorted_2 = indexes_sorted[kind_of_texts == 2] + + counter = EynollahIdCounter(region_idx=ref_point) + for idx_textregion, _ in enumerate(found_polygons_text_region): + id_of_texts.append(counter.next_region_id) + interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[idx_textregion]] + if len(interest) > 0: + order_of_texts.append(interest[0]) + + for idx_headerregion, _ in enumerate(found_polygons_text_region_h): + id_of_texts.append(counter.next_region_id) + interest = indexes_sorted_2[index_of_types_2[idx_headerregion]] + order_of_texts.append(interest) + + return order_of_texts, id_of_texts diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index c8c34e4..a54103c 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -3,6 +3,7 @@ from pathlib import Path import os.path from .utils.xml import create_page_xml, add_textequiv, xml_reading_order +from .utils.counter import EynollahIdCounter from ocrd_utils import getLogger from lxml import etree as ET @@ -12,6 +13,7 @@ class EynollahXmlWriter(): def __init__(self, *, dir_out, image_filename, curved_line): self.logger = getLogger('eynollah.writer') + self.counter = EynollahIdCounter() self.dir_out = dir_out self.image_filename = image_filename self.image_filename_stem = Path(Path(image_filename).name).stem @@ -36,11 +38,10 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l): + def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): textline = ET.SubElement(marginal, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 + textline.set('id', counter.next_line_id) coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) points_co = '' @@ -77,53 +78,50 @@ class EynollahXmlWriter(): if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1: points_co += ' ' coord.set('points',points_co) - return id_indexer_l - def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): + def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter): self.logger.debug('enter serialize_lines_in_region') for j in range(len(all_found_texline_polygons[region_idx])): textline = ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 + textline.set('id', counter.next_line_id) coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) + region_bboxes = all_box_coord[region_idx] points_co = '' - for l in range(len(all_found_texline_polygons[region_idx][j])): + for idx_contour_textline, contour_textline in enumerate(all_found_texline_polygons[region_idx][j]): if not self.curved_line: - if len(all_found_texline_polygons[region_idx][j][l])==2: - textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + if len(contour_textline) == 2: + textline_x_coord = max(0, int((contour_textline[0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((contour_textline[1] + region_bboxes[0] + page_coord[0]) / self.scale_y)) else: - textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + textline_x_coord = max(0, int((contour_textline[0][0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((contour_textline[0][1] + region_bboxes[0] + page_coord[0]) / self.scale_y)) points_co += str(textline_x_coord) points_co += ',' points_co += str(textline_y_coord) if self.curved_line and np.abs(slopes[region_idx]) <= 45: - if len(all_found_texline_polygons[region_idx][j][l]) == 2: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x)) + if len(contour_textline) == 2: + points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y)) + points_co += str(int((contour_textline[1] + page_coord[0]) / self.scale_y)) else: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y)) + points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) elif self.curved_line and np.abs(slopes[region_idx]) > 45: - if len(all_found_texline_polygons[region_idx][j][l])==2: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + if len(contour_textline)==2: + points_co += str(int((contour_textline[0] + region_bboxes[2] + page_coord[2])/self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + points_co += str(int((contour_textline[1] + region_bboxes[0] + page_coord[0])/self.scale_y)) else: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + points_co += str(int((contour_textline[0][0] + region_bboxes[2]+page_coord[2])/self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y)) - if l < len(all_found_texline_polygons[region_idx][j]) - 1: - points_co += ' ' - coord.set('points',points_co) - return id_indexer_l + points_co += ' ' + coord.set('points', points_co[:-1]) def write_pagexml(self, pcgts): self.logger.info("filename stem: '%s'", self.image_filename_stem) @@ -139,43 +137,40 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) - id_of_marginalia = [] - id_indexer = 0 - id_indexer_l = 0 + counter = EynollahIdCounter() if len(found_polygons_text_region) > 0: - id_of_marginalia = xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) - for mm in range(len(found_polygons_text_region)): - textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - add_textequiv(textregion) + _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] + xml_reading_order(page, order_of_texts, id_of_marginalia) + + for mm in range(len(found_polygons_text_region)): + textregion = ET.SubElement(page, 'TextRegion') + textregion.set('id', counter.next_region_id) + textregion.set('type', 'paragraph') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) + self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) + add_textequiv(textregion) for mm in range(len(found_polygons_marginals)): marginal = ET.SubElement(page, 'TextRegion') - marginal.set('id', id_of_marginalia[mm]) + marginal.set('id', counter.next_region_id) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) - id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) for mm in range(len(found_polygons_text_region_img)): textregion = ET.SubElement(page, 'ImageRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') points_co = '' for lmm in range(len(found_polygons_text_region_img[mm])): points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - if lmm < len(found_polygons_text_region_img[mm]) - 1: - points_co += ' ' - coord_text.set('points', points_co) + points_co += ' ' + coord_text.set('points', points_co[:-1]) return pcgts @@ -188,85 +183,73 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) - id_indexer = 0 - id_indexer_l = 0 - id_of_marginalia = [] + counter = EynollahIdCounter() + _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] + xml_reading_order(page, order_of_texts, id_of_marginalia) - if len(found_polygons_text_region) > 0: - id_of_marginalia = xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) - for mm in range(len(found_polygons_text_region)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - add_textequiv(textregion) + for mm in range(len(found_polygons_text_region)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', counter.next_region_id) + textregion.set('type', 'paragraph') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) + self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) + add_textequiv(textregion) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) - if len(found_polygons_text_region_h) > 0: - for mm in range(len(found_polygons_text_region_h)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - textregion.set('type','header') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) - add_textequiv(textregion) - - if len(found_polygons_drop_capitals) > 0: - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) - for mm in range(len(found_polygons_drop_capitals)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id',' r%s' % id_indexer) - id_indexer += 1 - textregion.set('type', 'drop-capital') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) - add_textequiv(textregion) + for mm in range(len(found_polygons_text_region_h)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', counter.next_region_id) + textregion.set('type','header') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)) + self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, counter) + add_textequiv(textregion) for mm in range(len(found_polygons_marginals)): marginal = ET.SubElement(page, 'TextRegion') add_textequiv(textregion) - marginal.set('id', id_of_marginalia[mm]) + marginal.set('id', counter.next_region_id) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) + + for mm in range(len(found_polygons_drop_capitals)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', counter.next_region_id) + textregion.set('type', 'drop-capital') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)) + add_textequiv(textregion) - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) for mm in range(len(found_polygons_text_region_img)): textregion=ET.SubElement(page, 'ImageRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)) for mm in range(len(found_polygons_tables)): textregion = ET.SubElement(page, 'TableRegion') - textregion.set('id', 'r%s' %id_indexer) - id_indexer += 1 + textregion.set('id', counter.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables[mm], page_coord)) return pcgts - def calculate_polygon_coords(self, contour_list, i, page_coord): + def calculate_polygon_coords(self, contour, page_coord): self.logger.debug('enter calculate_polygon_coords') coords = '' - for j in range(len(contour_list[i])): - if len(contour_list[i][j]) == 2: - coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x)) + for value_bbox in contour: + if len(value_bbox) == 2: + coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) coords += ',' - coords += str(int((contour_list[i][j][1] + page_coord[0]) / self.scale_y)) + coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) else: - coords += str(int((contour_list[i][j][0][0] + page_coord[2]) / self.scale_x)) + coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) coords += ',' - coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y)) - - if j < len(contour_list[i]) - 1: - coords=coords + ' ' - return coords + coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) + coords=coords + ' ' + return coords[:-1] diff --git a/tests/test_counter.py b/tests/test_counter.py new file mode 100644 index 0000000..8ef0756 --- /dev/null +++ b/tests/test_counter.py @@ -0,0 +1,33 @@ +from tests.base import main +from qurator.eynollah.utils.counter import EynollahIdCounter + +def test_counter_string(): + c = EynollahIdCounter() + assert c.next_region_id == 'region_0001' + assert c.next_region_id == 'region_0002' + assert c.next_line_id == 'region_0002_line_0001' + assert c.next_region_id == 'region_0003' + assert c.next_line_id == 'region_0003_line_0001' + assert c.region_id(999) == 'region_0999' + assert c.line_id(999, 888) == 'region_0999_line_0888' + +def test_counter_init(): + c = EynollahIdCounter(region_idx=2) + assert c.get('region') == 2 + c.inc('region') + assert c.get('region') == 3 + c.reset() + assert c.get('region') == 2 + +def test_counter_methods(): + c = EynollahIdCounter() + assert c.get('region') == 0 + c.inc('region', 5) + assert c.get('region') == 5 + c.set('region', 10) + assert c.get('region') == 10 + c.inc('region', -9) + assert c.get('region') == 1 + +if __name__ == '__main__': + main(__file__)