"""Sequential ID generation for regions and for the lines inside a region."""
from collections import Counter

REGION_ID_TEMPLATE = 'region_%04d'
LINE_ID_TEMPLATE = 'region_%04d_line_%04d'


class EynollahIdCounter:
    """Hand out consecutive region and line IDs.

    Two named counts are kept in a ``collections.Counter``: ``'region'`` and
    ``'line'``. IDs are rendered with ``REGION_ID_TEMPLATE`` and
    ``LINE_ID_TEMPLATE``.

    Args:
        region_idx: Starting value of the ``'region'`` count. The first
            ``next_region_id`` therefore yields ``region_idx + 1``.
        line_idx: Starting value of the ``'line'`` count. It only affects
            line IDs requested before the first ``next_region_id``, because
            advancing the region resets the line count to 0.
    """

    def __init__(self, region_idx=0, line_idx=0):
        # Seed the counts from the arguments. Previously both were accepted
        # but ignored, so every counter started at 0 regardless of the
        # offset the caller asked for.
        self._counter = Counter(region=region_idx, line=line_idx)

    def inc(self, name, val=1):
        """Add ``val`` (default 1) to the count stored under ``name``."""
        self._counter.update({name: val})

    def get(self, name):
        """Return the count stored under ``name`` (0 if never set)."""
        return self._counter[name]

    def set(self, name, val):
        """Overwrite the count stored under ``name`` with ``val``."""
        self._counter[name] = val

    @property
    def next_region_id(self):
        """Advance to the next region and return its ID.

        NOTE: reading this property mutates state. It increments the
        ``'region'`` count and resets the ``'line'`` count to 0, so each
        read returns a new ID.
        """
        self.inc('region')
        self.set('line', 0)
        return REGION_ID_TEMPLATE % self._counter['region']

    @property
    def next_line_id(self):
        """Advance to the next line of the current region and return its ID.

        NOTE: reading this property mutates state. It increments the
        ``'line'`` count, so each read returns a new ID.
        """
        self.inc('line')
        return LINE_ID_TEMPLATE % (self._counter['region'], self._counter['line'])
index_of_types_1[idx_textregion]] if len(interest) > 0: order_of_texts.append(interest[0]) - index_b += 1 - for mm, _ in enumerate(found_polygons_text_region_h): - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_2[index_of_types_2[mm]] + for idx_headerregion, _ in enumerate(found_polygons_text_region_h): + id_of_texts.append(counter.next_region_id) + interest = indexes_sorted_2[index_of_types_2[idx_headerregion]] order_of_texts.append(interest) - index_b += 1 return order_of_texts, id_of_texts - diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 874b69c..70ac17b 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -3,6 +3,7 @@ from pathlib import Path import os.path from .utils.xml import create_page_xml, add_textequiv, xml_reading_order +from .utils.counter import EynollahIdCounter from ocrd_utils import getLogger from lxml import etree as ET @@ -12,6 +13,7 @@ class EynollahXmlWriter(): def __init__(self, *, dir_out, image_filename, curved_line): self.logger = getLogger('eynollah.writer') + self.counter = EynollahIdCounter() self.dir_out = dir_out self.image_filename = image_filename self.image_filename_stem = Path(Path(image_filename).name).stem @@ -139,38 +141,37 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) + counter_textregions = EynollahIdCounter() + counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + id_of_marginalia = [] - for idx_marginal, _ in enumerate(found_polygons_marginals): - id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal) + for _ in found_polygons_marginals: + id_of_marginalia.append(counter_marginals.next_region_id) - id_indexer = 0 id_indexer_l = 0 if len(found_polygons_text_region) > 0: xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion = 
ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) add_textequiv(textregion) - for mm in range(len(found_polygons_marginals)): + for idx_marginal, _ in enumerate(found_polygons_marginals): marginal = ET.SubElement(page, 'TextRegion') - marginal.set('id', id_of_marginalia[mm]) + marginal.set('id', id_of_marginalia[idx_marginal]) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) - id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) for mm in range(len(found_polygons_text_region_img)): textregion = ET.SubElement(page, 'ImageRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') points_co = '' for lmm in range(len(found_polygons_text_region_img[mm])): @@ -192,18 +193,19 @@ class EynollahXmlWriter(): coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords(cont_page)) - id_indexer = 0 + counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) + counter_textregions = EynollahIdCounter() + id_indexer_l = 0 id_of_marginalia = [] - for idx_marginal, _ in enumerate(found_polygons_marginals): - id_of_marginalia.append('r%s' % len(order_of_texts) + 
idx_marginal) + for _ in found_polygons_marginals: + id_of_marginalia.append(counter_marginals.next_region_id) if len(found_polygons_text_region) > 0: xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) @@ -214,8 +216,7 @@ class EynollahXmlWriter(): if len(found_polygons_text_region_h) > 0: for mm in range(len(found_polygons_text_region_h)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) textregion.set('type','header') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) @@ -223,11 +224,9 @@ class EynollahXmlWriter(): add_textequiv(textregion) if len(found_polygons_drop_capitals) > 0: - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) for mm in range(len(found_polygons_drop_capitals)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id',' r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) textregion.set('type', 'drop-capital') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) @@ -241,19 +240,17 @@ class EynollahXmlWriter(): coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) id_indexer_l = 
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) + counter_textregions.inc('region', counter_marginals.get('region')) - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) for mm in range(len(found_polygons_text_region_img)): textregion=ET.SubElement(page, 'ImageRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) for mm in range(len(found_polygons_tables)): textregion = ET.SubElement(page, 'TableRegion') - textregion.set('id', 'r%s' %id_indexer) - id_indexer += 1 + textregion.set('id', counter_textregions.next_region_id) coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord))