enforce keyword-only arguments for writer.build_pagexml_no_full_layout and writer.build_pagexml_full_layout

This commit is contained in:
kba 2025-11-27 12:43:45 +01:00
parent 4aa9543a7d
commit c24cf94bce
2 changed files with 218 additions and 81 deletions

View file

@ -8,7 +8,6 @@ document layout analysis (segmentation) with output in PAGE-XML
# FIXME: fix all of those... # FIXME: fix all of those...
# pyright: reportUnnecessaryTypeIgnoreComment=true # pyright: reportUnnecessaryTypeIgnoreComment=true
# pyright: reportPossiblyUnboundVariable=false # pyright: reportPossiblyUnboundVariable=false
# pyright: reportMissingImports=false
# pyright: reportCallIssue=false # pyright: reportCallIssue=false
# pyright: reportOperatorIssue=false # pyright: reportOperatorIssue=false
# pyright: reportUnboundVariable=false # pyright: reportUnboundVariable=false
@ -49,9 +48,9 @@ import statistics
tf_disable_interactive_logs() tf_disable_interactive_logs()
import tensorflow as tf import tensorflow as tf # type: ignore
try: try:
import torch import torch # type: ignore
except ImportError: except ImportError:
torch = None torch = None
try: try:
@ -3372,13 +3371,28 @@ class Eynollah:
conf_contours_textregions =[0] conf_contours_textregions =[0]
pcgts = self.writer.build_pagexml_no_full_layout( pcgts = self.writer.build_pagexml_no_full_layout(
cont_page, page_coord, order_text_new, id_of_texts_tot, found_polygons_text_region=cont_page,
all_found_textline_polygons, page_coord, [], page_coord=page_coord,
[], [], [], [], [], [], order_of_texts=order_text_new,
slopes, [], [], id_of_texts=id_of_texts_tot,
cont_page, [], [], all_found_textline_polygons=all_found_textline_polygons,
all_box_coord=page_coord,
polygons_of_images=[],
polygons_of_marginals_left=[],
polygons_of_marginals_right=[],
all_found_textline_polygons_marginals_left=[],
all_found_textline_polygons_marginals_right=[],
all_box_coord_marginals_left=[],
all_box_coord_marginals_right=[],
slopes=slopes,
slopes_marginals_left=[],
slopes_marginals_right=[],
cont_page=cont_page,
polygons_seplines=[],
contours_tables=[],
conf_contours_textregion=conf_contours_textregions, conf_contours_textregion=conf_contours_textregions,
skip_layout_reading_order=True) skip_layout_reading_order=True
)
self.logger.info("Basic processing complete") self.logger.info("Basic processing complete")
return pcgts return pcgts
@ -3422,8 +3436,26 @@ class Eynollah:
self.logger.info("No columns detected - generating empty PAGE-XML") self.logger.info("No columns detected - generating empty PAGE-XML")
pcgts = self.writer.build_pagexml_no_full_layout( pcgts = self.writer.build_pagexml_no_full_layout(
[], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], [], found_polygons_text_region=[],
cont_page, [], []) page_coord=page_coord,
order_of_texts=[],
id_of_texts=[],
all_found_textline_polygons=[],
all_box_coord=[],
polygons_of_images=[],
polygons_of_marginals_left=[],
polygons_of_marginals_right=[],
all_found_textline_polygons_marginals_left=[],
all_found_textline_polygons_marginals_right=[],
all_box_coord_marginals_left=[],
all_box_coord_marginals_right=[],
slopes=[],
slopes_marginals_left=[],
slopes_marginals_right=[],
cont_page=cont_page,
polygons_seplines=[],
contours_tables=[]
)
return pcgts return pcgts
#print("text region early in %.1fs", time.time() - t0) #print("text region early in %.1fs", time.time() - t0)
@ -3636,22 +3668,53 @@ class Eynollah:
empty_marginals = [[]] * len(polygons_of_marginals) empty_marginals = [[]] * len(polygons_of_marginals)
if self.full_layout: if self.full_layout:
pcgts = self.writer.build_pagexml_full_layout( pcgts = self.writer.build_pagexml_full_layout(
[], [], page_coord, [], [], [], [], [], [], contours_only_text_parent=[],
polygons_of_images, contours_tables, [], contours_only_text_parent_h=[],
polygons_of_marginals, polygons_of_marginals, page_coord=page_coord,
empty_marginals, empty_marginals, order_of_texts=[],
empty_marginals, empty_marginals, id_of_texts=[],
[], [], [], [], all_found_textline_polygons=[],
cont_page, polygons_seplines) all_found_textline_polygons_h=[],
all_box_coord=[],
all_box_coord_h=[],
polygons_of_images=polygons_of_images,
contours_tables=contours_tables,
polygons_of_drop_capitals=[],
polygons_of_marginals_left=polygons_of_marginals,
polygons_of_marginals_right=polygons_of_marginals,
all_found_textline_polygons_marginals_left=empty_marginals,
all_found_textline_polygons_marginals_right=empty_marginals,
all_box_coord_marginals_left=empty_marginals,
all_box_coord_marginals_right=empty_marginals,
slopes=[],
slopes_h=[],
slopes_marginals_left=[],
slopes_marginals_right=[],
cont_page=cont_page,
polygons_seplines=polygons_seplines
)
else: else:
pcgts = self.writer.build_pagexml_no_full_layout( pcgts = self.writer.build_pagexml_no_full_layout(
[], page_coord, [], [], [], [], found_polygons_text_region=[],
polygons_of_images, page_coord=page_coord,
polygons_of_marginals, polygons_of_marginals, order_of_texts=[],
empty_marginals, empty_marginals, id_of_texts=[],
empty_marginals, empty_marginals, all_found_textline_polygons=[],
[], [], [], all_box_coord=[],
cont_page, polygons_seplines, contours_tables) polygons_of_images=polygons_of_images,
polygons_of_marginals_left=polygons_of_marginals,
polygons_of_marginals_right=polygons_of_marginals,
all_found_textline_polygons_marginals_left=empty_marginals,
all_found_textline_polygons_marginals_right=empty_marginals,
all_box_coord_marginals_left=empty_marginals,
all_box_coord_marginals_right=empty_marginals,
slopes=[],
slopes_marginals_left=[],
slopes_marginals_right=[],
cont_page=cont_page,
polygons_seplines=polygons_seplines,
contours_tables=contours_tables
)
return pcgts return pcgts
@ -3810,24 +3873,55 @@ class Eynollah:
if self.full_layout: if self.full_layout:
pcgts = self.writer.build_pagexml_full_layout( pcgts = self.writer.build_pagexml_full_layout(
contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, found_polygons_text_region=contours_only_text_parent,
all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_h=contours_only_text_parent_h,
polygons_of_images, contours_tables, polygons_of_drop_capitals, page_coord=page_coord,
polygons_of_marginals_left, polygons_of_marginals_right, order_of_texts=order_text_new,
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, id_of_texts=id_of_texts_tot,
all_box_coord_marginals_left, all_box_coord_marginals_right, all_found_textline_polygons=all_found_textline_polygons,
slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, all_found_textline_polygons_h=all_found_textline_polygons_h,
cont_page, polygons_seplines, all_box_coord=all_box_coord,
conf_contours_textregions, conf_contours_textregions_h) all_box_coord_h=all_box_coord_h,
polygons_of_images=polygons_of_images,
contours_tables=contours_tables,
polygons_of_drop_capitals=polygons_of_drop_capitals,
polygons_of_marginals_left=polygons_of_marginals_left,
polygons_of_marginals_right=polygons_of_marginals_right,
all_found_textline_polygons_marginals_left=all_found_textline_polygons_marginals_left,
all_found_textline_polygons_marginals_right=all_found_textline_polygons_marginals_right,
all_box_coord_marginals_left=all_box_coord_marginals_left,
all_box_coord_marginals_right=all_box_coord_marginals_right,
slopes=slopes,
slopes_h=slopes_h,
slopes_marginals_left=slopes_marginals_left,
slopes_marginals_right=slopes_marginals_right,
cont_page=cont_page,
polygons_seplines=polygons_seplines,
conf_contours_textregions=conf_contours_textregions,
conf_contours_textregions_h=conf_contours_textregions_h
)
else: else:
pcgts = self.writer.build_pagexml_no_full_layout( pcgts = self.writer.build_pagexml_no_full_layout(
contours_only_text_parent, page_coord, order_text_new, id_of_texts_tot, found_polygons_text_region=contours_only_text_parent,
all_found_textline_polygons, all_box_coord, polygons_of_images, page_coord=page_coord,
polygons_of_marginals_left, polygons_of_marginals_right, order_of_texts=order_text_new,
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, id_of_texts=id_of_texts_tot,
all_box_coord_marginals_left, all_box_coord_marginals_right, all_found_textline_polygons=all_found_textline_polygons,
slopes, slopes_marginals_left, slopes_marginals_right, all_box_coord=all_box_coord,
cont_page, polygons_seplines, contours_tables, polygons_of_images=polygons_of_images,
conf_contours_textregions=conf_contours_textregions) polygons_of_marginals_left=polygons_of_marginals_left,
polygons_of_marginals_right=polygons_of_marginals_right,
all_found_textline_polygons_marginals_left=all_found_textline_polygons_marginals_left,
all_found_textline_polygons_marginals_right=all_found_textline_polygons_marginals_right,
all_box_coord_marginals_left=all_box_coord_marginals_left,
all_box_coord_marginals_right=all_box_coord_marginals_right,
slopes=slopes,
slopes_marginals_left=slopes_marginals_left,
slopes_marginals_right=slopes_marginals_right,
cont_page=cont_page,
polygons_seplines=polygons_seplines,
contours_tables=contours_tables,
conf_contours_textregions=conf_contours_textregions
)
return pcgts return pcgts

View file

@ -83,48 +83,91 @@ class EynollahXmlWriter:
f.write(to_xml(pcgts)) f.write(to_xml(pcgts))
def build_pagexml_no_full_layout(
        self,
        *,
        found_polygons_text_region,
        page_coord,
        order_of_texts,
        id_of_texts,
        all_found_textline_polygons,
        all_box_coord,
        found_polygons_text_region_img,
        found_polygons_marginals_left,
        found_polygons_marginals_right,
        all_found_textline_polygons_marginals_left,
        all_found_textline_polygons_marginals_right,
        all_box_coord_marginals_left,
        all_box_coord_marginals_right,
        slopes,
        slopes_marginals_left,
        slopes_marginals_right,
        cont_page,
        polygons_seplines,
        found_polygons_tables,
        **kwargs,
):
    """Build the page result for the non-full-layout case.

    Thin wrapper around ``build_pagexml_full_layout``: every argument is
    passed through under the same keyword, while the heading-specific
    arguments (``found_polygons_text_region_h``,
    ``all_found_textline_polygons_h``, ``all_box_coord_h``, ``slopes_h``)
    and ``found_polygons_drop_capitals`` are supplied as empty lists.

    All parameters are keyword-only.

    Args:
        **kwargs: Optional keyword arguments forwarded unchanged to
            ``build_pagexml_full_layout`` (for example
            ``conf_contours_textregions`` or ``skip_layout_reading_order``).
            A keyword that method does not accept raises ``TypeError``
            there.

    Returns:
        Whatever ``build_pagexml_full_layout`` returns.
    """
    # The optional arguments must keep flowing through: callers of this
    # wrapper pass ``conf_contours_textregions`` and
    # ``skip_layout_reading_order``, which only the full-layout builder
    # declares.
    return self.build_pagexml_full_layout(
        found_polygons_text_region=found_polygons_text_region,
        found_polygons_text_region_h=[],
        page_coord=page_coord,
        order_of_texts=order_of_texts,
        id_of_texts=id_of_texts,
        all_found_textline_polygons=all_found_textline_polygons,
        all_found_textline_polygons_h=[],
        all_box_coord=all_box_coord,
        all_box_coord_h=[],
        found_polygons_text_region_img=found_polygons_text_region_img,
        found_polygons_tables=found_polygons_tables,
        found_polygons_drop_capitals=[],
        found_polygons_marginals_left=found_polygons_marginals_left,
        found_polygons_marginals_right=found_polygons_marginals_right,
        all_found_textline_polygons_marginals_left=all_found_textline_polygons_marginals_left,
        all_found_textline_polygons_marginals_right=all_found_textline_polygons_marginals_right,
        all_box_coord_marginals_left=all_box_coord_marginals_left,
        all_box_coord_marginals_right=all_box_coord_marginals_right,
        slopes=slopes,
        slopes_h=[],
        slopes_marginals_left=slopes_marginals_left,
        slopes_marginals_right=slopes_marginals_right,
        cont_page=cont_page,
        polygons_seplines=polygons_seplines,
        **kwargs,
    )
def build_pagexml_full_layout( def build_pagexml_full_layout(
self, self,
found_polygons_text_region, found_polygons_text_region_h, *,
page_coord, order_of_texts, id_of_texts, found_polygons_text_region,
all_found_textline_polygons, all_found_textline_polygons_h, found_polygons_text_region_h,
all_box_coord, all_box_coord_h, page_coord,
found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, order_of_texts,
found_polygons_marginals_left,found_polygons_marginals_right, id_of_texts,
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_found_textline_polygons,
all_box_coord_marginals_left, all_box_coord_marginals_right, all_found_textline_polygons_h,
slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, all_box_coord,
cont_page, polygons_seplines, all_box_coord_h,
ocr_all_textlines=None, ocr_all_textlines_h=None, found_polygons_text_region_img,
ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, found_polygons_tables,
found_polygons_drop_capitals,
found_polygons_marginals_left,
found_polygons_marginals_right,
all_found_textline_polygons_marginals_left,
all_found_textline_polygons_marginals_right,
all_box_coord_marginals_left,
all_box_coord_marginals_right,
slopes,
slopes_h,
slopes_marginals_left,
slopes_marginals_right,
cont_page,
polygons_seplines,
ocr_all_textlines=None,
ocr_all_textlines_h=None,
ocr_all_textlines_marginals_left=None,
ocr_all_textlines_marginals_right=None,
ocr_all_textlines_drop=None, ocr_all_textlines_drop=None,
conf_contours_textregions=None, conf_contours_textregions_h=None, conf_contours_textregions=None,
skip_layout_reading_order=False): conf_contours_textregions_h=None,
skip_layout_reading_order=False,
):
self.logger.debug('enter build_pagexml') self.logger.debug('enter build_pagexml')
# create the file structure # create the file structure