From 62fa710f272f8322e0860ebb55d98027c1a34c1a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 13:13:41 +0100 Subject: [PATCH 01/21] move order_and_id_of_texts to utils.xml --- qurator/eynollah/eynollah.py | 4 +-- qurator/eynollah/utils/__init__.py | 39 ------------------------------ qurator/eynollah/utils/xml.py | 32 ++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index cb5b028..75971ec 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -63,11 +63,11 @@ from .utils import ( putt_bb_of_drop_capitals_of_model_in_patches_in_layout, check_any_text_region_in_model_one_is_main_or_header, small_textlines_to_parent_adherence2, - order_and_id_of_texts, order_of_regions, find_number_of_columns_in_document, return_boxes_of_images_by_order_of_reading_new) from .utils.pil_cv2 import check_dpi +from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter from .writer import EynollahXmlWriter @@ -1308,7 +1308,7 @@ class Eynollah: tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - for jji in range(len(id_of_texts)): + for jji, _ in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) ref_point = ref_point + len(id_of_texts) diff --git a/qurator/eynollah/utils/__init__.py b/qurator/eynollah/utils/__init__.py index ac72ef9..a44c6c8 100644 --- a/qurator/eynollah/utils/__init__.py +++ b/qurator/eynollah/utils/__init__.py @@ -977,45 +977,6 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): - indexes_sorted = np.array(indexes_sorted) - index_of_types = np.array(index_of_types) - kind_of_texts = np.array(kind_of_texts) - - id_of_texts = [] - order_of_texts = [] - - index_of_types_1 = index_of_types[kind_of_texts == 1] - indexes_sorted_1 = indexes_sorted[kind_of_texts == 1] - - index_of_types_2 = index_of_types[kind_of_texts == 2] - indexes_sorted_2 = indexes_sorted[kind_of_texts == 2] - - ##print(index_of_types,'index_of_types') - ##print(kind_of_texts,'kind_of_texts') - ##print(len(found_polygons_text_region),'found_polygons_text_region') - ##print(index_of_types_1,'index_of_types_1') - ##print(indexes_sorted_1,'indexes_sorted_1') - index_b = 0 + ref_point - for mm in range(len(found_polygons_text_region)): - - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[mm]] - - if len(interest) > 0: - order_of_texts.append(interest[0]) - index_b += 1 - else: - pass - - for mm in range(len(found_polygons_text_region_h)): - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_2[index_of_types_2[mm]] - order_of_texts.append(interest) - index_b += 1 - - return order_of_texts, id_of_texts - def order_of_regions(textline_mask, contours_main, contours_header, y_ref): ##plt.imshow(textline_mask) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index fe806e9..4f41461 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -1,4 +1,6 @@ +# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member from lxml import etree as ET +import numpy as np NAMESPACES = {} NAMESPACES['page'] = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" @@ -60,3 +62,33 @@ def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found indexer_region += 1 return id_of_marginalia +def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): + indexes_sorted = np.array(indexes_sorted) + index_of_types = np.array(index_of_types) + kind_of_texts = np.array(kind_of_texts) + + id_of_texts = [] + order_of_texts = [] + + index_of_types_1 = index_of_types[kind_of_texts == 1] + indexes_sorted_1 = indexes_sorted[kind_of_texts == 1] + + index_of_types_2 = index_of_types[kind_of_texts == 2] + indexes_sorted_2 = indexes_sorted[kind_of_texts == 2] + + index_b = 0 + ref_point + for mm, _ in enumerate(found_polygons_text_region): + id_of_texts.append("r" + str(index_b)) + interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[mm]] + if len(interest) > 0: + order_of_texts.append(interest[0]) + index_b += 1 + + for mm, _ in enumerate(found_polygons_text_region_h): + id_of_texts.append("r" + str(index_b)) + interest = indexes_sorted_2[index_of_types_2[mm]] + order_of_texts.append(interest) + index_b += 1 + + return order_of_texts, id_of_texts + From 38ab2aa5732697ae8a9f28191e694e26034eee59 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 13:36:25 +0100 Subject: [PATCH 02/21] eliminate tartib var --- qurator/eynollah/eynollah.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 75971ec..46739bd 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -1173,18 +1173,16 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point + order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + ref_point += len(id_of_texts) order_of_texts_tot = [] for tj1 in range(len(contours_only_text_parent)): @@ -1247,18 +1245,16 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point + order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + ref_point += len(id_of_texts) order_of_texts_tot = [] for tj1 in range(len(contours_only_text_parent)): @@ -1305,13 +1301,12 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for jji, _ in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + ref_point += len(id_of_texts) order_of_texts_tot = [] for tj1 in range(len(contours_only_text_parent)): @@ -1353,13 +1348,12 @@ class Eynollah: for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + ref_point += len(id_of_texts) order_of_texts_tot = [] for tj1 in range(len(contours_only_text_parent)): From 630002d96d151aa68968ee168ab6f1148686e946 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 13:54:15 +0100 Subject: [PATCH 03/21] minor clean up xml_reading_order --- qurator/eynollah/utils/xml.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index 4f41461..4ae4a06 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -1,4 +1,5 @@ # pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member +# pylint: disable=invalid-name from lxml import etree as ET import numpy as np @@ -47,15 +48,13 @@ def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found region_order_sub = ET.SubElement(region_order, 'OrderedGroup') region_order_sub.set('id', "ro357564684568544579089") indexer_region = 0 - for vj in order_of_texts: - name = "coord_text_%s" % vj + for idx_text in order_of_texts: name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(indexer_region)) - name.set('regionRef', id_of_texts[vj]) + name.set('regionRef', id_of_texts[idx_text]) indexer_region += 1 - for vm in range(len(found_polygons_marginals)): + for _ in found_polygons_marginals: id_of_marginalia.append('r%s' % indexer_region) - name = "coord_text_%s" % indexer_region name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(indexer_region)) name.set('regionRef', 'r%s' % indexer_region) From 9f5e4af5f087b9a542ad4b8617f1a682eb69b72f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 14:13:56 +0100 Subject: [PATCH 04/21] factor out marginalia ID calc from xml_reading_order --- qurator/eynollah/utils/xml.py | 7 +------ qurator/eynollah/writer.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index 4ae4a06..3e76e68 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -40,10 +40,7 @@ def add_textequiv(parent, text=''): unireg = ET.SubElement(textequiv, 'Unicode') unireg.text = text -def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals): - """ - XXX side-effect: extends id_of_marginalia - """ +def xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals): region_order = ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') region_order_sub.set('id', "ro357564684568544579089") @@ -54,12 +51,10 @@ def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found name.set('regionRef', id_of_texts[idx_text]) indexer_region += 1 for _ in found_polygons_marginals: - id_of_marginalia.append('r%s' % indexer_region) name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(indexer_region)) name.set('regionRef', 'r%s' % indexer_region) indexer_region += 1 - return id_of_marginalia def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): indexes_sorted = np.array(indexes_sorted) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index c8c34e4..874b69c 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -140,10 +140,14 @@ class EynollahXmlWriter(): coord_page.set('points', self.calculate_page_coords(cont_page)) id_of_marginalia = [] + for idx_marginal, _ in enumerate(found_polygons_marginals): + id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal) + id_indexer = 0 id_indexer_l = 0 + if len(found_polygons_text_region) > 0: - id_of_marginalia = xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) @@ -191,9 +195,11 @@ class EynollahXmlWriter(): id_indexer = 0 id_indexer_l = 0 id_of_marginalia = [] + for idx_marginal, _ in enumerate(found_polygons_marginals): + id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal) if len(found_polygons_text_region) > 0: - id_of_marginalia = xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) From 24da8798444a8f43ce0776bec7cb449fe796bb2a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 15:24:56 +0100 Subject: [PATCH 05/21] add EynollahIdCounter class --- qurator/eynollah/utils/counter.py | 29 +++++++++++++++++++ qurator/eynollah/utils/xml.py | 18 ++++++------ qurator/eynollah/writer.py | 47 +++++++++++++++---------------- 3 files changed, 59 insertions(+), 35 deletions(-) create mode 100644 qurator/eynollah/utils/counter.py diff --git a/qurator/eynollah/utils/counter.py b/qurator/eynollah/utils/counter.py new file mode 100644 index 0000000..e2ba34f --- /dev/null +++ b/qurator/eynollah/utils/counter.py @@ -0,0 +1,29 @@ +from collections import Counter + +REGION_ID_TEMPLATE = 'region_%04d' +LINE_ID_TEMPLATE = 'region_%04d_line_%04d' + +class EynollahIdCounter(): + + def __init__(self, region_idx=0, line_idx=0): + self._counter = Counter() + + def inc(self, name, val=1): + self._counter.update({name: val}) + + def get(self, name): + return self._counter[name] + + def set(self, name, val): + self._counter[name] = val + + @property + def next_region_id(self): + self.inc('region') + self.set('line', 0) + return REGION_ID_TEMPLATE % self._counter['region'] + + @property + def next_line_id(self): + self.inc('line') + return LINE_ID_TEMPLATE % (self._counter['region'], self._counter['line']) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index 3e76e68..194e7eb 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -1,6 +1,7 @@ # pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member # pylint: disable=invalid-name from lxml import etree as ET +from .counter import EynollahIdCounter import numpy as np NAMESPACES = {} @@ -70,19 +71,16 @@ def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region index_of_types_2 = index_of_types[kind_of_texts == 2] indexes_sorted_2 = indexes_sorted[kind_of_texts == 2] - index_b = 0 + ref_point - for mm, _ in enumerate(found_polygons_text_region): - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[mm]] + counter = EynollahIdCounter(region_idx=ref_point) + for idx_textregion, _ in enumerate(found_polygons_text_region): + id_of_texts.append(counter.next_region_id) + interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[idx_textregion]] if len(interest) > 0: order_of_texts.append(interest[0]) - index_b += 1 - for mm, _ in enumerate(found_polygons_text_region_h): - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_2[index_of_types_2[mm]] + for idx_headerregion, _ in enumerate(found_polygons_text_region_h): + id_of_texts.append(counter.next_region_id) + interest = indexes_sorted_2[index_of_types_2[idx_headerregion]] order_of_texts.append(interest) - index_b += 1 return order_of_texts, id_of_texts - diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 874b69c..70ac17b 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -3,6 +3,7 @@ from pathlib import Path import os.path from .utils.xml import create_page_xml, add_textequiv, xml_reading_order +from .utils.counter import EynollahIdCounter from ocrd_utils import getLogger from lxml import etree as ET @@ -12,6 +13,7 @@ class EynollahXmlWriter(): def __init__(self, *, dir_out, image_filename, curved_line): self.logger = getLogger('eynollah.writer') + self.counter = EynollahIdCounter() self.dir_out = dir_out self.image_filename = image_filename self.image_filename_stem = Path(Path(image_filename).name).stem @@ -139,38 +141,37 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) + counter_textregions = EynollahIdCounter() + counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia = [] - for idx_marginal, _ in enumerate(found_polygons_marginals): - id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal) + for _ in found_polygons_marginals: + id_of_marginalia.append(counter_marginals.next_region_id) - id_indexer = 0 id_indexer_l = 0 if len(found_polygons_text_region) > 0: xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) add_textequiv(textregion) - for mm in range(len(found_polygons_marginals)): + for idx_marginal, _ in enumerate(found_polygons_marginals): marginal = ET.SubElement(page, 'TextRegion') - marginal.set('id', id_of_marginalia[mm]) + marginal.set('id', id_of_marginalia[idx_marginal]) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) - id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) for mm in range(len(found_polygons_text_region_img)): textregion = ET.SubElement(page, 'ImageRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') points_co = '' for lmm in range(len(found_polygons_text_region_img[mm])): @@ -192,18 +193,19 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) - id_indexer = 0 + counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + counter_textregions = EynollahIdCounter() + id_indexer_l = 0 id_of_marginalia = [] - for idx_marginal, _ in enumerate(found_polygons_marginals): - id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal) + for _ in found_polygons_marginals: + id_of_marginalia.append(counter_marginals.next_region_id) if len(found_polygons_text_region) > 0: xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) @@ -214,8 +216,7 @@ class EynollahXmlWriter(): if len(found_polygons_text_region_h) > 0: for mm in range(len(found_polygons_text_region_h)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) textregion.set('type','header') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) @@ -223,11 +224,9 @@ class EynollahXmlWriter(): add_textequiv(textregion) if len(found_polygons_drop_capitals) > 0: - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) for mm in range(len(found_polygons_drop_capitals)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id',' r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'drop-capital') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) @@ -241,19 +240,17 @@ class EynollahXmlWriter(): coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) + counter_textregions.inc('region', counter_marginals.get('region')) - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) for mm in range(len(found_polygons_text_region_img)): textregion=ET.SubElement(page, 'ImageRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) for mm in range(len(found_polygons_tables)): textregion = ET.SubElement(page, 'TableRegion') - textregion.set('id', 'r%s' %id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) From 20fcac623251c632547d67e90211966fd327757f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 15:26:05 +0100 Subject: [PATCH 06/21] remove unnecessary if --- qurator/eynollah/writer.py | 70 ++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 70ac17b..ac74b2f 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -152,14 +152,15 @@ class EynollahXmlWriter(): if len(found_polygons_text_region) > 0: xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) - for mm in range(len(found_polygons_text_region)): - textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', counter_textregions.next_region_id) - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - add_textequiv(textregion) + + for mm in range(len(found_polygons_text_region)): + textregion = ET.SubElement(page, 'TextRegion') + textregion.set('id', counter_textregions.next_region_id) + textregion.set('type', 'paragraph') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) + add_textequiv(textregion) for idx_marginal, _ in enumerate(found_polygons_marginals): marginal = ET.SubElement(page, 'TextRegion') @@ -201,36 +202,33 @@ class EynollahXmlWriter(): for _ in found_polygons_marginals: id_of_marginalia.append(counter_marginals.next_region_id) - if len(found_polygons_text_region) > 0: - xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) - for mm in range(len(found_polygons_text_region)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter_textregions.next_region_id) - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - add_textequiv(textregion) + xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) + for mm in range(len(found_polygons_text_region)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', counter_textregions.next_region_id) + textregion.set('type', 'paragraph') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) + add_textequiv(textregion) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) - if len(found_polygons_text_region_h) > 0: - for mm in range(len(found_polygons_text_region_h)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter_textregions.next_region_id) - textregion.set('type','header') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) - add_textequiv(textregion) - - if len(found_polygons_drop_capitals) > 0: - for mm in range(len(found_polygons_drop_capitals)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter_textregions.next_region_id) - textregion.set('type', 'drop-capital') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) - add_textequiv(textregion) + for mm in range(len(found_polygons_text_region_h)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', counter_textregions.next_region_id) + textregion.set('type','header') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) + add_textequiv(textregion) + + for mm in range(len(found_polygons_drop_capitals)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', counter_textregions.next_region_id) + textregion.set('type', 'drop-capital') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) + add_textequiv(textregion) for mm in range(len(found_polygons_marginals)): marginal = ET.SubElement(page, 'TextRegion') From 1cd3ee1a2e639cfba6148ef70dbcab911703fa23 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 15:33:41 +0100 Subject: [PATCH 07/21] simplify calculate_polygon_coords --- qurator/eynollah/writer.py | 41 ++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index ac74b2f..e423dae 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -158,7 +158,7 @@ class EynollahXmlWriter(): textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) add_textequiv(textregion) @@ -167,7 +167,7 @@ class EynollahXmlWriter(): marginal.set('id', id_of_marginalia[idx_marginal]) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) for mm in range(len(found_polygons_text_region_img)): @@ -179,9 +179,8 @@ class EynollahXmlWriter(): points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - if lmm < len(found_polygons_text_region_img[mm]) - 1: - points_co += ' ' - coord_text.set('points', points_co) + points_co += ' ' + coord_text.set('points', points_co[:-1]) return pcgts @@ -208,7 +207,7 @@ class EynollahXmlWriter(): textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) add_textequiv(textregion) @@ -218,7 +217,7 @@ class EynollahXmlWriter(): textregion.set('id', counter_textregions.next_region_id) textregion.set('type','header') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) add_textequiv(textregion) @@ -227,7 +226,7 @@ class EynollahXmlWriter(): textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'drop-capital') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)) add_textequiv(textregion) for mm in range(len(found_polygons_marginals)): @@ -236,7 +235,7 @@ class EynollahXmlWriter(): marginal.set('id', id_of_marginalia[mm]) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) counter_textregions.inc('region', counter_marginals.get('region')) @@ -244,30 +243,28 @@ class EynollahXmlWriter(): textregion=ET.SubElement(page, 'ImageRegion') textregion.set('id', counter_textregions.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)) for mm in range(len(found_polygons_tables)): textregion = ET.SubElement(page, 'TableRegion') textregion.set('id', counter_textregions.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables[mm], page_coord)) return pcgts - def calculate_polygon_coords(self, contour_list, i, page_coord): + def calculate_polygon_coords(self, contour, page_coord): self.logger.debug('enter calculate_polygon_coords') coords = '' - for j in range(len(contour_list[i])): - if len(contour_list[i][j]) == 2: - coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x)) + for value_bbox in contour: + if len(value_bbox) == 2: + coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) coords += ',' - coords += str(int((contour_list[i][j][1] + page_coord[0]) / self.scale_y)) + coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) else: - coords += str(int((contour_list[i][j][0][0] + page_coord[2]) / self.scale_x)) + coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) coords += ',' - coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y)) - - if j < len(contour_list[i]) - 1: - coords=coords + ' ' - return coords + coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) + coords=coords + ' ' + return coords[:-1] From 9b1da7c0234d1e909076c88c2e1b6ef1fc24f238 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 15:53:43 +0100 Subject: [PATCH 08/21] use counter for lines too --- qurator/eynollah/writer.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index e423dae..535393e 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -38,11 +38,10 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l): + def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): textline = ET.SubElement(marginal, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 + textline.set('id', counter.next_line_id) coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) points_co = '' @@ -79,14 +78,12 @@ class EynollahXmlWriter(): if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1: points_co += ' ' coord.set('points',points_co) - return id_indexer_l - def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): + def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter): self.logger.debug('enter serialize_lines_in_region') for j in range(len(all_found_texline_polygons[region_idx])): textline = ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 + textline.set('id', counter.next_line_id) coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) @@ -125,7 +122,6 @@ class EynollahXmlWriter(): if l < len(all_found_texline_polygons[region_idx][j]) - 1: points_co += ' ' coord.set('points',points_co) - return id_indexer_l def write_pagexml(self, pcgts): self.logger.info("filename stem: '%s'", self.image_filename_stem) @@ -148,8 +144,6 @@ class EynollahXmlWriter(): for _ in found_polygons_marginals: id_of_marginalia.append(counter_marginals.next_region_id) - id_indexer_l = 0 - if len(found_polygons_text_region) > 0: xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) @@ -159,7 +153,7 @@ class EynollahXmlWriter(): textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) + self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter_textregions) add_textequiv(textregion) for idx_marginal, _ in enumerate(found_polygons_marginals): @@ -168,7 +162,7 @@ class EynollahXmlWriter(): marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) - id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter_textregions) for mm in range(len(found_polygons_text_region_img)): textregion = ET.SubElement(page, 'ImageRegion') @@ -196,7 +190,6 @@ class EynollahXmlWriter(): counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) counter_textregions = EynollahIdCounter() - id_indexer_l = 0 id_of_marginalia = [] for _ in found_polygons_marginals: id_of_marginalia.append(counter_marginals.next_region_id) @@ -208,7 +201,7 @@ class EynollahXmlWriter(): textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) + self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter_textregions) add_textequiv(textregion) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) @@ -218,7 +211,7 @@ class EynollahXmlWriter(): textregion.set('type','header') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) + self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, counter_textregions) add_textequiv(textregion) for mm in range(len(found_polygons_drop_capitals)): @@ -236,7 +229,7 @@ class EynollahXmlWriter(): marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) - id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter_textregions) counter_textregions.inc('region', counter_marginals.get('region')) for mm in range(len(found_polygons_text_region_img)): From 98568402c7e422dd2a2dd5aec8e2f2398dc9ff62 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 16:13:03 +0100 Subject: [PATCH 09/21] counter: init-overrideable --- qurator/eynollah/utils/counter.py | 2 ++ tests/test_counter.py | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tests/test_counter.py diff --git a/qurator/eynollah/utils/counter.py b/qurator/eynollah/utils/counter.py index e2ba34f..f076c23 100644 --- a/qurator/eynollah/utils/counter.py +++ b/qurator/eynollah/utils/counter.py @@ -7,6 +7,8 @@ class EynollahIdCounter(): def __init__(self, region_idx=0, line_idx=0): self._counter = Counter() + self.set('region', region_idx) + self.set('line', line_idx) def inc(self, name, val=1): self._counter.update({name: val}) diff --git a/tests/test_counter.py b/tests/test_counter.py new file mode 100644 index 0000000..44715b8 --- /dev/null +++ b/tests/test_counter.py @@ -0,0 +1,27 @@ +from tests.base import main +from qurator.eynollah.utils.counter import EynollahIdCounter + +def test_counter_string(): + c = EynollahIdCounter() + assert c.next_region_id == 'region_0001' + assert c.next_region_id == 'region_0002' + assert c.next_line_id == 'region_0002_line_0001' + assert c.next_region_id == 'region_0003' + assert c.next_line_id == 'region_0003_line_0001' + +def test_counter_init(): + c = EynollahIdCounter(region_idx=2) + assert c.get('region') == 2 + +def test_counter_methods(): + c = EynollahIdCounter() + assert c.get('region') == 0 + c.inc('region', 5) + assert c.get('region') == 5 + c.set('region', 10) + assert c.get('region') == 10 + c.inc('region', -9) + assert c.get('region') == 1 + +if __name__ == '__main__': + main(__file__) From 6b2a6588fa4ebd92bd8ca1062ce205e720f8d379 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 2 Mar 2021 10:37:00 -0500 Subject: [PATCH 10/21] Ein klein bug gefixt --- qurator/eynollah/writer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 874b69c..c2a3d61 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -141,7 +141,7 @@ class EynollahXmlWriter(): id_of_marginalia = [] for idx_marginal, _ in enumerate(found_polygons_marginals): - id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal) + id_of_marginalia.append('r%s' % (len(order_of_texts) + idx_marginal) ) id_indexer = 0 id_indexer_l = 0 @@ -191,12 +191,12 @@ class EynollahXmlWriter(): page_print_sub = ET.SubElement(page, "Border") coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) - + id_indexer = 0 id_indexer_l = 0 id_of_marginalia = [] for idx_marginal, _ in enumerate(found_polygons_marginals): - id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal) + id_of_marginalia.append('r%s' % ( len(order_of_texts) + idx_marginal) ) if len(found_polygons_text_region) > 0: xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) From 7eb973b3aa36091343ca2c177f593c94289cad97 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 17:23:31 +0100 Subject: [PATCH 11/21] xml_reading_order takes id_of_marginals directly --- qurator/eynollah/utils/xml.py | 11 +++++------ qurator/eynollah/writer.py | 11 ++--------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index 194e7eb..3123412 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -41,21 +41,20 @@ def add_textequiv(parent, text=''): unireg = ET.SubElement(textequiv, 'Unicode') unireg.text = text -def xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals): +def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia): region_order = ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') region_order_sub.set('id', "ro357564684568544579089") indexer_region = 0 - for idx_text in order_of_texts: + for id_of_textregion in order_of_texts: name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(indexer_region)) - name.set('regionRef', id_of_texts[idx_text]) + name.set('regionRef', id_of_textregion) indexer_region += 1 - for _ in found_polygons_marginals: + for id_marginal in id_of_marginalia: name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(indexer_region)) - name.set('regionRef', 'r%s' % indexer_region) - indexer_region += 1 + name.set('regionRef', id_marginal) def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): indexes_sorted = np.array(indexes_sorted) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 535393e..1d027a5 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -140,12 +140,8 @@ class EynollahXmlWriter(): counter_textregions = EynollahIdCounter() counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia = [] - for _ in found_polygons_marginals: - id_of_marginalia.append(counter_marginals.next_region_id) - if len(found_polygons_text_region) > 0: - xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) + xml_reading_order(page, order_of_texts, id_of_texts, [counter_marginals.next_region_id for _ in found_polygons_marginals]) for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') @@ -190,11 +186,8 @@ class EynollahXmlWriter(): counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) counter_textregions = EynollahIdCounter() - id_of_marginalia = [] - for _ in found_polygons_marginals: - id_of_marginalia.append(counter_marginals.next_region_id) + xml_reading_order(page, order_of_texts, id_of_texts, [counter_marginals.next_region_id for _ in found_polygons_marginals]) - xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') textregion.set('id', counter_textregions.next_region_id) From 56b688befe4802ce618005f521f2d51d4425c4c6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 17:41:45 +0100 Subject: [PATCH 12/21] counter: allow arbitrary line/region id --- qurator/eynollah/utils/counter.py | 16 ++++++++++++++-- tests/test_counter.py | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/qurator/eynollah/utils/counter.py b/qurator/eynollah/utils/counter.py index f076c23..5280c6f 100644 --- a/qurator/eynollah/utils/counter.py +++ b/qurator/eynollah/utils/counter.py @@ -19,13 +19,25 @@ class EynollahIdCounter(): def set(self, name, val): self._counter[name] = val + def region_id(self, region_idx=None): + if not region_idx: + region_idx = self._counter['region'] + return REGION_ID_TEMPLATE % region_idx + + def line_id(self, region_idx=None, line_idx=None): + if not region_idx: + region_idx = self._counter['region'] + if not line_idx: + line_idx = self._counter['line'] + return LINE_ID_TEMPLATE % (region_idx, line_idx) + @property def next_region_id(self): self.inc('region') self.set('line', 0) - return REGION_ID_TEMPLATE % self._counter['region'] + return self.region_id() @property def next_line_id(self): self.inc('line') - return LINE_ID_TEMPLATE % (self._counter['region'], self._counter['line']) + return self.line_id() diff --git a/tests/test_counter.py b/tests/test_counter.py index 44715b8..a157cd5 100644 --- a/tests/test_counter.py +++ b/tests/test_counter.py @@ -8,6 +8,8 @@ def test_counter_string(): assert c.next_line_id == 'region_0002_line_0001' assert c.next_region_id == 'region_0003' assert c.next_line_id == 'region_0003_line_0001' + assert c.region_id(999) == 'region_0999' + assert c.line_id(999, 888) == 'region_0999_line_0888' def test_counter_init(): c = EynollahIdCounter(region_idx=2) From d95fcf14c0699e870c9ba5bb2300bd6a401a4b07 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 17:47:06 +0100 Subject: [PATCH 13/21] id_of_marginalia still necessary --- qurator/eynollah/writer.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 1d027a5..02e1c4f 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -140,8 +140,9 @@ class EynollahXmlWriter(): counter_textregions = EynollahIdCounter() counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia = [counter_marginals.next_region_id for _ in found_polygons_marginals] if len(found_polygons_text_region) > 0: - xml_reading_order(page, order_of_texts, id_of_texts, [counter_marginals.next_region_id for _ in found_polygons_marginals]) + xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia) for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') @@ -152,12 +153,12 @@ class EynollahXmlWriter(): self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter_textregions) add_textequiv(textregion) - for idx_marginal, _ in enumerate(found_polygons_marginals): + for idx_marginal, marginal_polygon in enumerate(found_polygons_marginals): marginal = ET.SubElement(page, 'TextRegion') marginal.set('id', id_of_marginalia[idx_marginal]) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) + coord_text.set('points', self.calculate_polygon_coords(marginal_polygon, page_coord)) self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter_textregions) for mm in range(len(found_polygons_text_region_img)): @@ -183,10 +184,11 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) - counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) counter_textregions = EynollahIdCounter() + counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - xml_reading_order(page, order_of_texts, id_of_texts, [counter_marginals.next_region_id for _ in found_polygons_marginals]) + id_of_marginalia = [counter_marginals.next_region_id for _ in found_polygons_marginals] + xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') From 03d75f57889a4dd964128a74c51d75e64d27a929 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Mar 2021 17:51:30 +0100 Subject: [PATCH 14/21] simplify serialize_lines_in_region --- qurator/eynollah/writer.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 02e1c4f..6fb9061 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -86,42 +86,42 @@ class EynollahXmlWriter(): textline.set('id', counter.next_line_id) coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) + region_bboxes = all_box_coord[region_idx] points_co = '' - for l in range(len(all_found_texline_polygons[region_idx][j])): + for idx_contour_textline, contour_textline in all_found_texline_polygons[region_idx][j]: if not self.curved_line: - if len(all_found_texline_polygons[region_idx][j][l])==2: - textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + if len(contour_textline) == 2: + textline_x_coord = max(0, int((contour_textline[0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((contour_textline[1] + region_bboxes[0] + page_coord[0]) / self.scale_y)) else: - textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + textline_x_coord = max(0, int((contour_textline[0][0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((contour_textline[0][1] + region_bboxes[0] + page_coord[0]) / self.scale_y)) points_co += str(textline_x_coord) points_co += ',' points_co += str(textline_y_coord) if self.curved_line and np.abs(slopes[region_idx]) <= 45: - if len(all_found_texline_polygons[region_idx][j][l]) == 2: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x)) + if len(contour_textline) == 2: + points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y)) + points_co += str(int((contour_textline[1] + page_coord[0]) / self.scale_y)) else: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y)) + points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) elif self.curved_line and np.abs(slopes[region_idx]) > 45: - if len(all_found_texline_polygons[region_idx][j][l])==2: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + if len(contour_textline)==2: + points_co += str(int((contour_textline[0] + region_bboxes[2] + page_coord[2])/self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + points_co += str(int((contour_textline[1] + region_bboxes[0] + page_coord[0])/self.scale_y)) else: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + points_co += str(int((contour_textline[0][0] + region_bboxes[2]+page_coord[2])/self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y)) - if l < len(all_found_texline_polygons[region_idx][j]) - 1: - points_co += ' ' - coord.set('points',points_co) + points_co += ' ' + coord.set('points', points_co[:-1]) def write_pagexml(self, pcgts): self.logger.info("filename stem: '%s'", self.image_filename_stem) From c5736e9b748c353ec7ec4b86e6b6d99e09e82085 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 10 Mar 2021 11:52:01 +0100 Subject: [PATCH 15/21] fix region counting --- qurator/eynollah/utils/xml.py | 5 +++-- qurator/eynollah/writer.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index 3123412..d08c288 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -45,11 +45,12 @@ def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia): region_order = ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') region_order_sub.set('id', "ro357564684568544579089") + region_counter = EynollahIdCounter() indexer_region = 0 - for id_of_textregion in order_of_texts: + for idx_textregion, _ in enumerate(order_of_texts): name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(indexer_region)) - name.set('regionRef', id_of_textregion) + name.set('regionRef', region_counter.region_id(order_of_texts[idx_textregion])) indexer_region += 1 for id_marginal in id_of_marginalia: name = ET.SubElement(region_order_sub, 'RegionRefIndexed') diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 6fb9061..3bd68f2 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -89,7 +89,7 @@ class EynollahXmlWriter(): region_bboxes = all_box_coord[region_idx] points_co = '' - for idx_contour_textline, contour_textline in all_found_texline_polygons[region_idx][j]: + for idx_contour_textline, contour_textline in enumerate(all_found_texline_polygons[region_idx][j]): if not self.curved_line: if len(contour_textline) == 2: textline_x_coord = max(0, int((contour_textline[0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) From 6c60d9e90a4eefd763d430ed9ab77c50903840dc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 10 Mar 2021 17:27:18 +0100 Subject: [PATCH 16/21] reading order: fix @index --- qurator/eynollah/utils/xml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index 30355ea..b5c5ce3 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -50,6 +50,7 @@ def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia): name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(region_counter.get('region'))) name.set('regionRef', region_counter.region_id(order_of_texts[idx_textregion])) + region_counter.inc('region') for id_marginal in id_of_marginalia: name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(region_counter.get('region'))) From a3465ca1a017e6d198ea4851f76dda0734e950f2 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 11 Mar 2021 19:44:42 +0100 Subject: [PATCH 17/21] eliminate id_of_texts from xml_reading_order, fix plus one error --- qurator/eynollah/utils/counter.py | 6 +++--- qurator/eynollah/utils/xml.py | 4 ++-- qurator/eynollah/writer.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/qurator/eynollah/utils/counter.py b/qurator/eynollah/utils/counter.py index 5280c6f..6bd9d7d 100644 --- a/qurator/eynollah/utils/counter.py +++ b/qurator/eynollah/utils/counter.py @@ -20,14 +20,14 @@ class EynollahIdCounter(): self._counter[name] = val def region_id(self, region_idx=None): - if not region_idx: + if region_idx is None: region_idx = self._counter['region'] return REGION_ID_TEMPLATE % region_idx def line_id(self, region_idx=None, line_idx=None): - if not region_idx: + if region_idx is None: region_idx = self._counter['region'] - if not line_idx: + if line_idx is None: line_idx = self._counter['line'] return LINE_ID_TEMPLATE % (region_idx, line_idx) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index b5c5ce3..e972218 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -41,7 +41,7 @@ def add_textequiv(parent, text=''): unireg = ET.SubElement(textequiv, 'Unicode') unireg.text = text -def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia): +def xml_reading_order(page, order_of_texts, id_of_marginalia): region_order = ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') region_order_sub.set('id', "ro357564684568544579089") @@ -49,7 +49,7 @@ def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia): for idx_textregion, _ in enumerate(order_of_texts): name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(region_counter.get('region'))) - name.set('regionRef', region_counter.region_id(order_of_texts[idx_textregion])) + name.set('regionRef', region_counter.region_id(order_of_texts[idx_textregion] + 1)) region_counter.inc('region') for id_marginal in id_of_marginalia: name = ET.SubElement(region_order_sub, 'RegionRefIndexed') diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 3bd68f2..d643941 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -142,7 +142,7 @@ class EynollahXmlWriter(): id_of_marginalia = [counter_marginals.next_region_id for _ in found_polygons_marginals] if len(found_polygons_text_region) > 0: - xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia) + xml_reading_order(page, order_of_texts, id_of_marginalia) for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') @@ -188,7 +188,7 @@ class EynollahXmlWriter(): counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) id_of_marginalia = [counter_marginals.next_region_id for _ in found_polygons_marginals] - xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia) + xml_reading_order(page, order_of_texts, id_of_marginalia) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') From a678bbf966897e9af9c4c3f3c6991049def02c7c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 12 Mar 2021 18:39:27 +0100 Subject: [PATCH 18/21] counter: add reset(); --- qurator/eynollah/utils/counter.py | 9 +++++++-- tests/test_counter.py | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/qurator/eynollah/utils/counter.py b/qurator/eynollah/utils/counter.py index 6bd9d7d..bc1d765 100644 --- a/qurator/eynollah/utils/counter.py +++ b/qurator/eynollah/utils/counter.py @@ -7,8 +7,13 @@ class EynollahIdCounter(): def __init__(self, region_idx=0, line_idx=0): self._counter = Counter() - self.set('region', region_idx) - self.set('line', line_idx) + self._inital_region_idx = region_idx + self._inital_line_idx = line_idx + self.reset() + + def reset(self): + self.set('region', self._inital_region_idx) + self.set('line', self._inital_line_idx) def inc(self, name, val=1): self._counter.update({name: val}) diff --git a/tests/test_counter.py b/tests/test_counter.py index a157cd5..8ef0756 100644 --- a/tests/test_counter.py +++ b/tests/test_counter.py @@ -14,6 +14,10 @@ def test_counter_string(): def test_counter_init(): c = EynollahIdCounter(region_idx=2) assert c.get('region') == 2 + c.inc('region') + assert c.get('region') == 3 + c.reset() + assert c.get('region') == 2 def test_counter_methods(): c = EynollahIdCounter() From 3d9da4feaa1b60d4341a8578d07ed78da8da97d7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 12 Mar 2021 19:29:52 +0100 Subject: [PATCH 19/21] writer: use a single counter for all regions/lines --- qurator/eynollah/writer.py | 41 ++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index d643941..1cd256b 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -137,33 +137,32 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) - counter_textregions = EynollahIdCounter() - counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - - id_of_marginalia = [counter_marginals.next_region_id for _ in found_polygons_marginals] + counter = EynollahIdCounter() if len(found_polygons_text_region) > 0: + _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] xml_reading_order(page, order_of_texts, id_of_marginalia) for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', counter_textregions.next_region_id) + textregion.set('id', counter.next_region_id) textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) - self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter_textregions) + self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) add_textequiv(textregion) - for idx_marginal, marginal_polygon in enumerate(found_polygons_marginals): + for marginal_polygon in found_polygons_marginals: marginal = ET.SubElement(page, 'TextRegion') - marginal.set('id', id_of_marginalia[idx_marginal]) + marginal.set('id', counter.next_region_id) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(marginal_polygon, page_coord)) - self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter_textregions) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) for mm in range(len(found_polygons_text_region_img)): textregion = ET.SubElement(page, 'ImageRegion') - textregion.set('id', counter_textregions.next_region_id) + textregion.set('id', counter.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') points_co = '' for lmm in range(len(found_polygons_text_region_img[mm])): @@ -184,15 +183,14 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) - counter_textregions = EynollahIdCounter() - counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - - id_of_marginalia = [counter_marginals.next_region_id for _ in found_polygons_marginals] + counter = EynollahIdCounter() + _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] xml_reading_order(page, order_of_texts, id_of_marginalia) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter_textregions.next_region_id) + textregion.set('id', counter.next_region_id) textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) @@ -202,7 +200,7 @@ class EynollahXmlWriter(): self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) for mm in range(len(found_polygons_text_region_h)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter_textregions.next_region_id) + textregion.set('id', counter.next_region_id) textregion.set('type','header') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)) @@ -211,7 +209,7 @@ class EynollahXmlWriter(): for mm in range(len(found_polygons_drop_capitals)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter_textregions.next_region_id) + textregion.set('id', counter.next_region_id) textregion.set('type', 'drop-capital') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)) @@ -220,22 +218,21 @@ class EynollahXmlWriter(): for mm in range(len(found_polygons_marginals)): marginal = ET.SubElement(page, 'TextRegion') add_textequiv(textregion) - marginal.set('id', id_of_marginalia[mm]) + marginal.set('id', counter.next_region_id) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) - self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter_textregions) - counter_textregions.inc('region', counter_marginals.get('region')) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) for mm in range(len(found_polygons_text_region_img)): textregion=ET.SubElement(page, 'ImageRegion') - textregion.set('id', counter_textregions.next_region_id) + textregion.set('id', counter.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)) for mm in range(len(found_polygons_tables)): textregion = ET.SubElement(page, 'TableRegion') - textregion.set('id', counter_textregions.next_region_id) + textregion.set('id', counter.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables[mm], page_coord)) From 43b8759acf407b970d07ff6929748459cc47fca6 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 7 Apr 2021 11:34:35 -0400 Subject: [PATCH 20/21] back on track- freezing problem , memory error and issues with reading order by drop capitals and marginals are resolved --- qurator/eynollah/eynollah.py | 65 +++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 9732b15..5d51aec 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -13,11 +13,10 @@ import time import warnings from pathlib import Path from multiprocessing import Process, Queue, cpu_count - +import gc from ocrd_utils import getLogger import cv2 import numpy as np - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" stderr = sys.stderr sys.stderr = open(os.devnull, "w") @@ -149,7 +148,7 @@ class Eynollah: def predict_enhancement(self, img): self.logger.debug("enter predict_enhancement") - model_enhancement, _ = self.start_new_session_and_model(self.model_dir_of_enhancement) + model_enhancement, session_enhancement = self.start_new_session_and_model(self.model_dir_of_enhancement) img_height_model = model_enhancement.layers[len(model_enhancement.layers) - 1].output_shape[1] img_width_model = model_enhancement.layers[len(model_enhancement.layers) - 1].output_shape[2] @@ -230,6 +229,10 @@ class Eynollah: prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg prediction_true = prediction_true.astype(int) + session_enhancement.close() + del model_enhancement + del session_enhancement + gc.collect() return prediction_true @@ -324,8 +327,14 @@ class Eynollah: self.logger.info("Found %s columns (%s)", num_col, label_p_pred) session_col_classifier.close() - + + del model_num_classifier + del session_col_classifier + K.clear_session() + gc.collect() + + img_new, _ = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) @@ -375,7 +384,10 @@ class Eynollah: is_image_enhanced = False num_column_is_classified = True image_res = np.copy(img) + + session_col_classifier.close() + self.logger.debug("exit resize_and_enhance_image_with_column_classifier") return is_image_enhanced, img, image_res, num_col, num_column_is_classified @@ -429,7 +441,7 @@ class Eynollah: self.writer.height_org = self.height_org self.writer.width_org = self.width_org - def start_new_session_and_model(self, model_dir): + def start_new_session_and_model_old(self, model_dir): self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) config = tf.ConfigProto() config.gpu_options.allow_growth = True @@ -438,6 +450,15 @@ class Eynollah: model = load_model(model_dir, compile=False) return model, session + + def start_new_session_and_model(self, model_dir): + self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) + gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) + #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True) + session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + model = load_model(model_dir, compile=False) + + return model, session def do_prediction(self, patches, img, model, marginal_of_patch_percent=0.1): self.logger.debug("enter do_prediction") @@ -554,6 +575,8 @@ class Eynollah: prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg_color prediction_true = prediction_true.astype(np.uint8) + del model + gc.collect() return prediction_true def early_page_for_num_of_column_classification(self): @@ -574,7 +597,10 @@ class Eynollah: box = [x, y, w, h] croped_page, page_coord = crop_image_inside_box(box, img) session_page.close() - + del model_page + del session_page + gc.collect() + K.clear_session() self.logger.debug("exit early_page_for_num_of_column_classification") return croped_page, page_coord @@ -606,7 +632,9 @@ class Eynollah: croped_page, page_coord = crop_image_inside_box(box, self.image) cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) session_page.close() - + del model_page + del session_page + gc.collect() K.clear_session() self.logger.debug("exit extract_page") return croped_page, page_coord, cont_page @@ -704,6 +732,10 @@ class Eynollah: prediction_regions = resize_image(prediction_regions, img_height_h, img_width_h) session_region.close() + del model_region + del session_region + gc.collect() + self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions2 @@ -1000,11 +1032,10 @@ class Eynollah: prediction_textline = resize_image(prediction_textline, img_h, img_w) prediction_textline_longshot = self.do_prediction(False, img, model_textline) prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) - ##plt.imshow(prediction_textline_streched[:,:,0]) - ##plt.show() session_textline.close() + return prediction_textline[:, :, 0], prediction_textline_longshot_true_size[:, :, 0] def do_work_of_slopes(self, q, poly, box_sub, boxes_per_process, textline_mask_tot, contours_per_process): @@ -1071,18 +1102,22 @@ class Eynollah: ##plt.show() prediction_regions_org=prediction_regions_org[:,:,0] prediction_regions_org[(prediction_regions_org[:,:]==1) & (mask_zeros_y[:,:]==1)]=0 + session_region.close() + del model_region + del session_region + gc.collect() model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p2) img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1])) prediction_regions_org2 = self.do_prediction(True, img, model_region, 0.2) prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) - #plt.imshow(prediction_regions_org2[:,:,0]) - #plt.show() - ##prediction_regions_org=prediction_regions_org[:,:,0] session_region.close() + del model_region + del session_region + gc.collect() mask_zeros2 = (prediction_regions_org2[:,:,0] == 0) mask_lines2 = (prediction_regions_org2[:,:,0] == 3) @@ -1303,7 +1338,7 @@ class Eynollah: arg_order_v = indexes_sorted_main[zahler] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = np.where(indexes_sorted == arg_order_v)[0][0] + ref_point - for jji, _ in range(len(id_of_texts)): + for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) @@ -1315,7 +1350,7 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) - + except Exception as why: self.logger.error(why) arg_text_con = [] @@ -1362,7 +1397,7 @@ class Eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) - + return order_text_new, id_of_texts_tot def do_order_of_regions(self, *args, **kwargs): From d5a9817390eeef2c8c9fcf411fb4ebf7b69455cf Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 7 Apr 2021 11:35:42 -0400 Subject: [PATCH 21/21] back on track- freezing problem , memory error and issues with reading order by drop capitals and marginals are resolved --- qurator/eynollah/writer.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 1cd256b..a54103c 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -152,12 +152,12 @@ class EynollahXmlWriter(): self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) add_textequiv(textregion) - for marginal_polygon in found_polygons_marginals: + for mm in range(len(found_polygons_marginals)): marginal = ET.SubElement(page, 'TextRegion') marginal.set('id', counter.next_region_id) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(marginal_polygon, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) for mm in range(len(found_polygons_text_region_img)): @@ -194,7 +194,7 @@ class EynollahXmlWriter(): textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) - self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter_textregions) + self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) add_textequiv(textregion) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) @@ -204,15 +204,7 @@ class EynollahXmlWriter(): textregion.set('type','header') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)) - self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, counter_textregions) - add_textequiv(textregion) - - for mm in range(len(found_polygons_drop_capitals)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter.next_region_id) - textregion.set('type', 'drop-capital') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)) + self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, counter) add_textequiv(textregion) for mm in range(len(found_polygons_marginals)): @@ -223,6 +215,14 @@ class EynollahXmlWriter(): coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) + + for mm in range(len(found_polygons_drop_capitals)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', counter.next_region_id) + textregion.set('type', 'drop-capital') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)) + add_textequiv(textregion) for mm in range(len(found_polygons_text_region_img)): textregion=ET.SubElement(page, 'ImageRegion')