diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 67a2989..eee7440 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -87,8 +87,50 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): - self.logger.debug('enter build_pagexml_no_full_layout') + def build_pagexml_no_full_layout( + self, found_polygons_text_region, + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, + all_box_coord, + found_polygons_text_region_img, + found_polygons_marginals_left, found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + found_polygons_tables, + **kwargs): + return self.build_pagexml_full_layout( + found_polygons_text_region, [], + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, [], + all_box_coord, [], + found_polygons_text_region_img, found_polygons_tables, [], + found_polygons_marginals_left, found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, [], slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + **kwargs) + + def build_pagexml_full_layout( + self, + found_polygons_text_region, found_polygons_text_region_h, + page_coord, order_of_texts, id_of_texts, + all_found_textline_polygons, all_found_textline_polygons_h, + all_box_coord, all_box_coord_h, + found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, + found_polygons_marginals_left,found_polygons_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, + all_box_coord_marginals_left, all_box_coord_marginals_right, + slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_seplines, + ocr_all_textlines=None, ocr_all_textlines_h=None, + ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, + ocr_all_textlines_drop=None, + conf_contours_textregion=None, conf_contours_textregion_h=None, + skip_layout_reading_order=False): + self.logger.debug('enter build_pagexml') # create the file structure pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) @@ -108,89 +150,10 @@ class EynollahXmlWriter(): textregion = TextRegionType( id=counter.next_region_id, type_='paragraph', Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, - skip_layout_reading_order), - conf=conf_contours_textregion[mm]), - ) - page.add_TextRegion(textregion) - if ocr_all_textlines: - ocr_textlines = ocr_all_textlines[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, - all_box_coord, slopes, counter, ocr_textlines) - - for mm, region_contour in enumerate(found_polygons_marginals_left): - marginal = TextRegionType( - id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TextRegion(marginal) - if ocr_all_textlines_marginals_left: - ocr_textlines = ocr_all_textlines_marginals_left[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, - all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) - - for mm, region_contour in enumerate(found_polygons_marginals_right): - marginal = TextRegionType( - id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TextRegion(marginal) - if ocr_all_textlines_marginals_right: - ocr_textlines = ocr_all_textlines_marginals_right[mm] - else: - ocr_textlines = None - self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, - all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) - - for region_contour in found_polygons_text_region_img: - img_region = ImageRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_ImageRegion(img_region) - - for region_contour in polygons_seplines: - sep = SeparatorRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])) - ) - page.add_SeparatorRegion(sep) - - for region_contour in found_polygons_tables: - tab = TableRegionType( - id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) - ) - page.add_TableRegion(tab) - - return pcgts - - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_seplines, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): - self.logger.debug('enter build_pagexml_full_layout') - - # create the file structure - pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) - page = pcgts.get_Page() - page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) - - counter = EynollahIdCounter() - if len(order_of_texts): - _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia_left = [_counter_marginals.next_region_id - for _ in found_polygons_marginals_left] - id_of_marginalia_right = [_counter_marginals.next_region_id - for _ in found_polygons_marginals_right] - xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) - - for mm, region_contour in enumerate(found_polygons_text_region): - textregion = TextRegionType( - id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord), - conf=conf_contours_textregion[mm]) + skip_layout_reading_order)) ) + if conf_contours_textregion: + textregion.Coords.set_conf(conf_contours_textregion[mm]) page.add_TextRegion(textregion) if ocr_all_textlines: ocr_textlines = ocr_all_textlines[mm] @@ -205,6 +168,8 @@ class EynollahXmlWriter(): id=counter.next_region_id, type_='heading', Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) ) + if conf_contours_textregion_h: + textregion.Coords.set_conf(conf_contours_textregion_h[mm]) page.add_TextRegion(textregion) if ocr_all_textlines_h: ocr_textlines = ocr_all_textlines_h[mm]