mirror of
				https://github.com/qurator-spk/eynollah.git
				synced 2025-11-04 03:34:16 +01:00 
			
		
		
		
	add EynollahIdCounter class
This commit is contained in:
		
							parent
							
								
									9f5e4af5f0
								
							
						
					
					
						commit
						24da879844
					
				
					 3 changed files with 60 additions and 36 deletions
				
			
		
							
								
								
									
										29
									
								
								qurator/eynollah/utils/counter.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								qurator/eynollah/utils/counter.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,29 @@
 | 
			
		|||
from collections import Counter
 | 
			
		||||
 | 
			
		||||
REGION_ID_TEMPLATE = 'region_%04d'
 | 
			
		||||
LINE_ID_TEMPLATE = 'region_%04d_line_%04d'
 | 
			
		||||
 | 
			
		||||
class EynollahIdCounter():
    """Generate sequential region and line IDs backed by named counters.

    Counts are stored per name in a ``collections.Counter``. The names
    ``'region'`` and ``'line'`` are the ones used by ``next_region_id`` and
    ``next_line_id``; other names may be used freely through ``inc``,
    ``get`` and ``set``.
    """

    def __init__(self, region_idx=0, line_idx=0):
        """Create a counter whose numbering starts after the given indexes.

        Args:
            region_idx: initial value of the ``'region'`` count; the first
                ``next_region_id`` is formatted with ``region_idx + 1``.
            line_idx: initial value of the ``'line'`` count; the first
                ``next_line_id`` is formatted with ``line_idx + 1``.
        """
        self._counter = Counter()
        # Seed the counts from the arguments. Without this the offsets passed
        # by callers are silently dropped and every counter starts at zero.
        self._counter['region'] = region_idx
        self._counter['line'] = line_idx

    def inc(self, name, val=1):
        """Add ``val`` (default 1) to the count stored under ``name``."""
        self._counter[name] += val

    def get(self, name):
        """Return the count stored under ``name`` (0 if it was never set)."""
        return self._counter[name]

    def set(self, name, val):
        """Overwrite the count stored under ``name`` with ``val``."""
        self._counter[name] = val

    @property
    def next_region_id(self):
        """Advance to the next region and return its ID string.

        NOTE: reading this property has side effects: it increments the
        ``'region'`` count and resets the ``'line'`` count to 0 so that line
        numbering restarts within the new region.
        """
        self.inc('region')
        self.set('line', 0)
        return REGION_ID_TEMPLATE % self._counter['region']

    @property
    def next_line_id(self):
        """Advance to the next line in the current region and return its ID.

        NOTE: reading this property has a side effect: it increments the
        ``'line'`` count.
        """
        self.inc('line')
        return LINE_ID_TEMPLATE % (self._counter['region'], self._counter['line'])
 | 
			
		||||
| 
						 | 
				
			
			@ -1,6 +1,7 @@
 | 
			
		|||
# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member
 | 
			
		||||
# pylint: disable=invalid-name
 | 
			
		||||
from lxml import etree as ET
 | 
			
		||||
from .counter import EynollahIdCounter
 | 
			
		||||
import numpy as np
 | 
			
		||||
 | 
			
		||||
NAMESPACES = {}
 | 
			
		||||
| 
						 | 
				
			
			@ -70,19 +71,16 @@ def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region
 | 
			
		|||
    index_of_types_2 = index_of_types[kind_of_texts == 2]
 | 
			
		||||
    indexes_sorted_2 = indexes_sorted[kind_of_texts == 2]
 | 
			
		||||
 | 
			
		||||
    index_b = 0 + ref_point
 | 
			
		||||
    for mm, _ in enumerate(found_polygons_text_region):
 | 
			
		||||
        id_of_texts.append("r" + str(index_b))
 | 
			
		||||
        interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[mm]]
 | 
			
		||||
    counter = EynollahIdCounter(region_idx=ref_point)
 | 
			
		||||
    for idx_textregion, _ in enumerate(found_polygons_text_region):
 | 
			
		||||
        id_of_texts.append(counter.next_region_id)
 | 
			
		||||
        interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[idx_textregion]]
 | 
			
		||||
        if len(interest) > 0:
 | 
			
		||||
            order_of_texts.append(interest[0])
 | 
			
		||||
            index_b += 1
 | 
			
		||||
 | 
			
		||||
    for mm, _ in enumerate(found_polygons_text_region_h):
 | 
			
		||||
        id_of_texts.append("r" + str(index_b))
 | 
			
		||||
        interest = indexes_sorted_2[index_of_types_2[mm]]
 | 
			
		||||
    for idx_headerregion, _ in enumerate(found_polygons_text_region_h):
 | 
			
		||||
        id_of_texts.append(counter.next_region_id)
 | 
			
		||||
        interest = indexes_sorted_2[index_of_types_2[idx_headerregion]]
 | 
			
		||||
        order_of_texts.append(interest)
 | 
			
		||||
        index_b += 1
 | 
			
		||||
 | 
			
		||||
    return order_of_texts, id_of_texts
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,6 +3,7 @@ from pathlib import Path
 | 
			
		|||
import os.path
 | 
			
		||||
 | 
			
		||||
from .utils.xml import create_page_xml, add_textequiv, xml_reading_order
 | 
			
		||||
from .utils.counter import EynollahIdCounter
 | 
			
		||||
 | 
			
		||||
from ocrd_utils import getLogger
 | 
			
		||||
from lxml import etree as ET
 | 
			
		||||
| 
						 | 
				
			
			@ -12,6 +13,7 @@ class EynollahXmlWriter():
 | 
			
		|||
 | 
			
		||||
    def __init__(self, *, dir_out, image_filename, curved_line):
 | 
			
		||||
        self.logger = getLogger('eynollah.writer')
 | 
			
		||||
        self.counter = EynollahIdCounter()
 | 
			
		||||
        self.dir_out = dir_out
 | 
			
		||||
        self.image_filename = image_filename
 | 
			
		||||
        self.image_filename_stem = Path(Path(image_filename).name).stem
 | 
			
		||||
| 
						 | 
				
			
			@ -139,38 +141,37 @@ class EynollahXmlWriter():
 | 
			
		|||
        coord_page = ET.SubElement(page_print_sub, "Coords")
 | 
			
		||||
        coord_page.set('points', self.calculate_page_coords(cont_page))
 | 
			
		||||
 | 
			
		||||
        id_of_marginalia = []
 | 
			
		||||
        for  idx_marginal, _ in enumerate(found_polygons_marginals):
 | 
			
		||||
            id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal)
 | 
			
		||||
        counter_textregions = EynollahIdCounter()
 | 
			
		||||
        counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts))
 | 
			
		||||
 | 
			
		||||
        id_of_marginalia = []
 | 
			
		||||
        for  _ in found_polygons_marginals:
 | 
			
		||||
            id_of_marginalia.append(counter_marginals.next_region_id)
 | 
			
		||||
 | 
			
		||||
        id_indexer = 0
 | 
			
		||||
        id_indexer_l = 0
 | 
			
		||||
 | 
			
		||||
        if len(found_polygons_text_region) > 0:
 | 
			
		||||
            xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals)
 | 
			
		||||
            for mm in range(len(found_polygons_text_region)):
 | 
			
		||||
                textregion = ET.SubElement(page, 'TextRegion')
 | 
			
		||||
                textregion.set('id', 'r%s' % id_indexer)
 | 
			
		||||
                id_indexer += 1
 | 
			
		||||
                textregion.set('id', counter_textregions.next_region_id)
 | 
			
		||||
                textregion.set('type', 'paragraph')
 | 
			
		||||
                coord_text = ET.SubElement(textregion, 'Coords')
 | 
			
		||||
                coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
 | 
			
		||||
                id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
 | 
			
		||||
                add_textequiv(textregion)
 | 
			
		||||
 | 
			
		||||
        for mm in range(len(found_polygons_marginals)):
 | 
			
		||||
        for idx_marginal, _ in enumerate(found_polygons_marginals):
 | 
			
		||||
            marginal = ET.SubElement(page, 'TextRegion')
 | 
			
		||||
            marginal.set('id', id_of_marginalia[mm])
 | 
			
		||||
            marginal.set('id', id_of_marginalia[idx_marginal])
 | 
			
		||||
            marginal.set('type', 'marginalia')
 | 
			
		||||
            coord_text = ET.SubElement(marginal, 'Coords')
 | 
			
		||||
            coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
 | 
			
		||||
            id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l)
 | 
			
		||||
 | 
			
		||||
        id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals)
 | 
			
		||||
        for mm in range(len(found_polygons_text_region_img)):
 | 
			
		||||
            textregion = ET.SubElement(page, 'ImageRegion')
 | 
			
		||||
            textregion.set('id', 'r%s' % id_indexer)
 | 
			
		||||
            id_indexer += 1
 | 
			
		||||
            textregion.set('id', counter_textregions.next_region_id)
 | 
			
		||||
            coord_text = ET.SubElement(textregion, 'Coords')
 | 
			
		||||
            points_co = ''
 | 
			
		||||
            for lmm in range(len(found_polygons_text_region_img[mm])):
 | 
			
		||||
| 
						 | 
				
			
			@ -192,18 +193,19 @@ class EynollahXmlWriter():
 | 
			
		|||
        coord_page = ET.SubElement(page_print_sub, "Coords")
 | 
			
		||||
        coord_page.set('points', self.calculate_page_coords(cont_page))
 | 
			
		||||
 | 
			
		||||
        id_indexer = 0
 | 
			
		||||
        counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts))
 | 
			
		||||
        counter_textregions = EynollahIdCounter()
 | 
			
		||||
 | 
			
		||||
        id_indexer_l = 0
 | 
			
		||||
        id_of_marginalia = []
 | 
			
		||||
        for  idx_marginal, _ in enumerate(found_polygons_marginals):
 | 
			
		||||
            id_of_marginalia.append('r%s' % len(order_of_texts) + idx_marginal)
 | 
			
		||||
        for  _ in found_polygons_marginals:
 | 
			
		||||
            id_of_marginalia.append(counter_marginals.next_region_id)
 | 
			
		||||
 | 
			
		||||
        if len(found_polygons_text_region) > 0:
 | 
			
		||||
            xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals)
 | 
			
		||||
            for mm in range(len(found_polygons_text_region)):
 | 
			
		||||
                textregion=ET.SubElement(page, 'TextRegion')
 | 
			
		||||
                textregion.set('id', 'r%s' % id_indexer)
 | 
			
		||||
                id_indexer += 1
 | 
			
		||||
                textregion.set('id', counter_textregions.next_region_id)
 | 
			
		||||
                textregion.set('type', 'paragraph')
 | 
			
		||||
                coord_text = ET.SubElement(textregion, 'Coords')
 | 
			
		||||
                coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
 | 
			
		||||
| 
						 | 
				
			
			@ -214,8 +216,7 @@ class EynollahXmlWriter():
 | 
			
		|||
        if len(found_polygons_text_region_h) > 0:
 | 
			
		||||
            for mm in range(len(found_polygons_text_region_h)):
 | 
			
		||||
                textregion=ET.SubElement(page, 'TextRegion')
 | 
			
		||||
                textregion.set('id', 'r%s' % id_indexer)
 | 
			
		||||
                id_indexer += 1
 | 
			
		||||
                textregion.set('id', counter_textregions.next_region_id)
 | 
			
		||||
                textregion.set('type','header')
 | 
			
		||||
                coord_text = ET.SubElement(textregion, 'Coords')
 | 
			
		||||
                coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord))
 | 
			
		||||
| 
						 | 
				
			
			@ -223,11 +224,9 @@ class EynollahXmlWriter():
 | 
			
		|||
                add_textequiv(textregion)
 | 
			
		||||
 | 
			
		||||
        if len(found_polygons_drop_capitals) > 0:
 | 
			
		||||
            id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals)
 | 
			
		||||
            for mm in range(len(found_polygons_drop_capitals)):
 | 
			
		||||
                textregion=ET.SubElement(page, 'TextRegion')
 | 
			
		||||
                textregion.set('id',' r%s' % id_indexer)
 | 
			
		||||
                id_indexer += 1
 | 
			
		||||
                textregion.set('id', counter_textregions.next_region_id)
 | 
			
		||||
                textregion.set('type', 'drop-capital')
 | 
			
		||||
                coord_text = ET.SubElement(textregion, 'Coords')
 | 
			
		||||
                coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord))
 | 
			
		||||
| 
						 | 
				
			
			@ -241,19 +240,17 @@ class EynollahXmlWriter():
 | 
			
		|||
            coord_text = ET.SubElement(marginal, 'Coords')
 | 
			
		||||
            coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
 | 
			
		||||
            id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l)
 | 
			
		||||
        counter_textregions.inc('region', counter_marginals.get('region'))
 | 
			
		||||
 | 
			
		||||
        id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals)
 | 
			
		||||
        for mm in range(len(found_polygons_text_region_img)):
 | 
			
		||||
            textregion=ET.SubElement(page, 'ImageRegion')
 | 
			
		||||
            textregion.set('id', 'r%s' % id_indexer)
 | 
			
		||||
            id_indexer += 1
 | 
			
		||||
            textregion.set('id', counter_textregions.next_region_id)
 | 
			
		||||
            coord_text = ET.SubElement(textregion, 'Coords')
 | 
			
		||||
            coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord))
 | 
			
		||||
 | 
			
		||||
        for mm in range(len(found_polygons_tables)):
 | 
			
		||||
            textregion = ET.SubElement(page, 'TableRegion')
 | 
			
		||||
            textregion.set('id', 'r%s' %id_indexer)
 | 
			
		||||
            id_indexer += 1
 | 
			
		||||
            textregion.set('id', counter_textregions.next_region_id)
 | 
			
		||||
            coord_text = ET.SubElement(textregion, 'Coords')
 | 
			
		||||
            coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord))
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue