mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-10-15 19:09:58 +02:00
writer: simplify
- `build_pagexml_no_full_layout`: delegate to `build_pagexml_full_layout` (removing redundant code)
This commit is contained in:
parent
e32479765c
commit
cbbb3248c7
1 changed file with 49 additions and 84 deletions
|
@ -87,8 +87,50 @@ class EynollahXmlWriter():
|
||||||
with open(self.output_filename, 'w') as f:
|
with open(self.output_filename, 'w') as f:
|
||||||
f.write(to_xml(pcgts))
|
f.write(to_xml(pcgts))
|
||||||
|
|
||||||
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False):
|
def build_pagexml_no_full_layout(
|
||||||
self.logger.debug('enter build_pagexml_no_full_layout')
|
self, found_polygons_text_region,
|
||||||
|
page_coord, order_of_texts, id_of_texts,
|
||||||
|
all_found_textline_polygons,
|
||||||
|
all_box_coord,
|
||||||
|
found_polygons_text_region_img,
|
||||||
|
found_polygons_marginals_left, found_polygons_marginals_right,
|
||||||
|
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right,
|
||||||
|
all_box_coord_marginals_left, all_box_coord_marginals_right,
|
||||||
|
slopes, slopes_marginals_left, slopes_marginals_right,
|
||||||
|
cont_page, polygons_seplines,
|
||||||
|
found_polygons_tables,
|
||||||
|
**kwargs):
|
||||||
|
return self.build_pagexml_full_layout(
|
||||||
|
found_polygons_text_region, [],
|
||||||
|
page_coord, order_of_texts, id_of_texts,
|
||||||
|
all_found_textline_polygons, [],
|
||||||
|
all_box_coord, [],
|
||||||
|
found_polygons_text_region_img, found_polygons_tables, [],
|
||||||
|
found_polygons_marginals_left, found_polygons_marginals_right,
|
||||||
|
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right,
|
||||||
|
all_box_coord_marginals_left, all_box_coord_marginals_right,
|
||||||
|
slopes, [], slopes_marginals_left, slopes_marginals_right,
|
||||||
|
cont_page, polygons_seplines,
|
||||||
|
**kwargs)
|
||||||
|
|
||||||
|
def build_pagexml_full_layout(
|
||||||
|
self,
|
||||||
|
found_polygons_text_region, found_polygons_text_region_h,
|
||||||
|
page_coord, order_of_texts, id_of_texts,
|
||||||
|
all_found_textline_polygons, all_found_textline_polygons_h,
|
||||||
|
all_box_coord, all_box_coord_h,
|
||||||
|
found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals,
|
||||||
|
found_polygons_marginals_left,found_polygons_marginals_right,
|
||||||
|
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right,
|
||||||
|
all_box_coord_marginals_left, all_box_coord_marginals_right,
|
||||||
|
slopes, slopes_h, slopes_marginals_left, slopes_marginals_right,
|
||||||
|
cont_page, polygons_seplines,
|
||||||
|
ocr_all_textlines=None, ocr_all_textlines_h=None,
|
||||||
|
ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None,
|
||||||
|
ocr_all_textlines_drop=None,
|
||||||
|
conf_contours_textregion=None, conf_contours_textregion_h=None,
|
||||||
|
skip_layout_reading_order=False):
|
||||||
|
self.logger.debug('enter build_pagexml')
|
||||||
|
|
||||||
# create the file structure
|
# create the file structure
|
||||||
pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org)
|
pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org)
|
||||||
|
@ -108,89 +150,10 @@ class EynollahXmlWriter():
|
||||||
textregion = TextRegionType(
|
textregion = TextRegionType(
|
||||||
id=counter.next_region_id, type_='paragraph',
|
id=counter.next_region_id, type_='paragraph',
|
||||||
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord,
|
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord,
|
||||||
skip_layout_reading_order),
|
skip_layout_reading_order))
|
||||||
conf=conf_contours_textregion[mm]),
|
|
||||||
)
|
|
||||||
page.add_TextRegion(textregion)
|
|
||||||
if ocr_all_textlines:
|
|
||||||
ocr_textlines = ocr_all_textlines[mm]
|
|
||||||
else:
|
|
||||||
ocr_textlines = None
|
|
||||||
self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord,
|
|
||||||
all_box_coord, slopes, counter, ocr_textlines)
|
|
||||||
|
|
||||||
for mm, region_contour in enumerate(found_polygons_marginals_left):
|
|
||||||
marginal = TextRegionType(
|
|
||||||
id=counter.next_region_id, type_='marginalia',
|
|
||||||
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord))
|
|
||||||
)
|
|
||||||
page.add_TextRegion(marginal)
|
|
||||||
if ocr_all_textlines_marginals_left:
|
|
||||||
ocr_textlines = ocr_all_textlines_marginals_left[mm]
|
|
||||||
else:
|
|
||||||
ocr_textlines = None
|
|
||||||
self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_left, mm, page_coord,
|
|
||||||
all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines)
|
|
||||||
|
|
||||||
for mm, region_contour in enumerate(found_polygons_marginals_right):
|
|
||||||
marginal = TextRegionType(
|
|
||||||
id=counter.next_region_id, type_='marginalia',
|
|
||||||
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord))
|
|
||||||
)
|
|
||||||
page.add_TextRegion(marginal)
|
|
||||||
if ocr_all_textlines_marginals_right:
|
|
||||||
ocr_textlines = ocr_all_textlines_marginals_right[mm]
|
|
||||||
else:
|
|
||||||
ocr_textlines = None
|
|
||||||
self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord,
|
|
||||||
all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines)
|
|
||||||
|
|
||||||
for region_contour in found_polygons_text_region_img:
|
|
||||||
img_region = ImageRegionType(
|
|
||||||
id=counter.next_region_id,
|
|
||||||
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord))
|
|
||||||
)
|
|
||||||
page.add_ImageRegion(img_region)
|
|
||||||
|
|
||||||
for region_contour in polygons_seplines:
|
|
||||||
sep = SeparatorRegionType(
|
|
||||||
id=counter.next_region_id,
|
|
||||||
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0]))
|
|
||||||
)
|
|
||||||
page.add_SeparatorRegion(sep)
|
|
||||||
|
|
||||||
for region_contour in found_polygons_tables:
|
|
||||||
tab = TableRegionType(
|
|
||||||
id=counter.next_region_id,
|
|
||||||
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord))
|
|
||||||
)
|
|
||||||
page.add_TableRegion(tab)
|
|
||||||
|
|
||||||
return pcgts
|
|
||||||
|
|
||||||
def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None):
|
|
||||||
self.logger.debug('enter build_pagexml_full_layout')
|
|
||||||
|
|
||||||
# create the file structure
|
|
||||||
pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org)
|
|
||||||
page = pcgts.get_Page()
|
|
||||||
page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page))))
|
|
||||||
|
|
||||||
counter = EynollahIdCounter()
|
|
||||||
if len(order_of_texts):
|
|
||||||
_counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts))
|
|
||||||
id_of_marginalia_left = [_counter_marginals.next_region_id
|
|
||||||
for _ in found_polygons_marginals_left]
|
|
||||||
id_of_marginalia_right = [_counter_marginals.next_region_id
|
|
||||||
for _ in found_polygons_marginals_right]
|
|
||||||
xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right)
|
|
||||||
|
|
||||||
for mm, region_contour in enumerate(found_polygons_text_region):
|
|
||||||
textregion = TextRegionType(
|
|
||||||
id=counter.next_region_id, type_='paragraph',
|
|
||||||
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord),
|
|
||||||
conf=conf_contours_textregion[mm])
|
|
||||||
)
|
)
|
||||||
|
if conf_contours_textregion:
|
||||||
|
textregion.Coords.set_conf(conf_contours_textregion[mm])
|
||||||
page.add_TextRegion(textregion)
|
page.add_TextRegion(textregion)
|
||||||
if ocr_all_textlines:
|
if ocr_all_textlines:
|
||||||
ocr_textlines = ocr_all_textlines[mm]
|
ocr_textlines = ocr_all_textlines[mm]
|
||||||
|
@ -205,6 +168,8 @@ class EynollahXmlWriter():
|
||||||
id=counter.next_region_id, type_='heading',
|
id=counter.next_region_id, type_='heading',
|
||||||
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord))
|
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord))
|
||||||
)
|
)
|
||||||
|
if conf_contours_textregion_h:
|
||||||
|
textregion.Coords.set_conf(conf_contours_textregion_h[mm])
|
||||||
page.add_TextRegion(textregion)
|
page.add_TextRegion(textregion)
|
||||||
if ocr_all_textlines_h:
|
if ocr_all_textlines_h:
|
||||||
ocr_textlines = ocr_all_textlines_h[mm]
|
ocr_textlines = ocr_all_textlines_h[mm]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue