move all XML handling to EynollahXmlWriter

pull/19/head
Konstantin Baierer 4 years ago
parent 22184024fa
commit c730e2eefd

@ -133,7 +133,7 @@ def main(
headers_off, headers_off,
) )
pcgts = eynollah.run() pcgts = eynollah.run()
eynollah.write_pagexml(pcgts) eynollah.writer.write_pagexml(pcgts)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

@ -14,7 +14,6 @@ import warnings
from pathlib import Path from pathlib import Path
from multiprocessing import Process, Queue, cpu_count from multiprocessing import Process, Queue, cpu_count
from lxml import etree as ET
from ocrd_utils import getLogger from ocrd_utils import getLogger
import cv2 import cv2
import numpy as np import numpy as np
@ -41,28 +40,19 @@ from .utils.contour import (
return_contours_of_interested_textline, return_contours_of_interested_textline,
return_parent_contours, return_parent_contours,
) )
from .utils.rotate import ( from .utils.rotate import (
rotate_image, rotate_image,
rotation_not_90_func, rotation_not_90_func,
rotation_not_90_func_full_layout rotation_not_90_func_full_layout)
)
from .utils.separate_lines import ( from .utils.separate_lines import (
textline_contours_postprocessing, textline_contours_postprocessing,
seperate_lines_new2, seperate_lines_new2,
return_deskew_slop, return_deskew_slop)
)
from .utils.drop_capitals import ( from .utils.drop_capitals import (
adhere_drop_capital_region_into_cprresponding_textline, adhere_drop_capital_region_into_cprresponding_textline,
filter_small_drop_capitals_from_no_patch_layout filter_small_drop_capitals_from_no_patch_layout)
)
from .utils.marginals import get_marginals from .utils.marginals import get_marginals
from .utils.resize import resize_image from .utils.resize import resize_image
from .utils import ( from .utils import (
boosting_headers_by_longshot_region_segmentation, boosting_headers_by_longshot_region_segmentation,
crop_image_inside_box, crop_image_inside_box,
@ -75,12 +65,10 @@ from .utils import (
order_and_id_of_texts, order_and_id_of_texts,
order_of_regions, order_of_regions,
find_number_of_columns_in_document, find_number_of_columns_in_document,
return_boxes_of_images_by_order_of_reading_new, return_boxes_of_images_by_order_of_reading_new)
)
from .utils.xml import create_page_xml, add_textequiv, xml_reading_order
from .utils.pil_cv2 import check_dpi from .utils.pil_cv2 import check_dpi
from .plot import EynollahPlotter from .plot import EynollahPlotter
from .writer import EynollahXmlWriter
SLOPE_THRESHOLD = 0.13 SLOPE_THRESHOLD = 0.13
RATIO_OF_TWO_MODEL_THRESHOLD = 95.50 #98.45: RATIO_OF_TWO_MODEL_THRESHOLD = 95.50 #98.45:
@ -107,7 +95,6 @@ class Eynollah:
headers_off=False headers_off=False
): ):
self.image_filename = image_filename self.image_filename = image_filename
self.cont_page = []
self.dir_out = dir_out self.dir_out = dir_out
self.image_filename_stem = image_filename_stem self.image_filename_stem = image_filename_stem
self.allow_enhancement = allow_enhancement self.allow_enhancement = allow_enhancement
@ -123,8 +110,11 @@ class Eynollah:
dir_of_cropped_images=dir_of_cropped_images, dir_of_cropped_images=dir_of_cropped_images,
dir_of_layout=dir_of_layout, dir_of_layout=dir_of_layout,
image_filename=image_filename, image_filename=image_filename,
image_filename_stem=image_filename_stem, image_filename_stem=image_filename_stem)
) self.writer = EynollahXmlWriter(
dir_out=self.dir_out,
image_filename=self.image_filename,
curved_line=self.curved_line)
self.logger = getLogger('eynollah') self.logger = getLogger('eynollah')
self.dir_models = dir_models self.dir_models = dir_models
@ -401,12 +391,16 @@ class Eynollah:
self.image = resize_image(self.image, self.img_hight_int, self.img_width_int) self.image = resize_image(self.image, self.img_hight_int, self.img_width_int)
# Also set for the plotter # Also set for the plotter
# XXX TODO hacky
if self.plotter: if self.plotter:
self.plotter.image_org = self.image_org self.plotter.image_org = self.image_org
self.plotter.scale_y = self.scale_y self.plotter.scale_y = self.scale_y
self.plotter.scale_x = self.scale_x self.plotter.scale_x = self.scale_x
# Also set for the writer
self.writer.image_org = self.image_org
self.writer.scale_y = self.scale_y
self.writer.scale_x = self.scale_x
self.writer.height_org = self.height_org
self.writer.width_org = self.width_org
def get_image_and_scales_after_enhancing(self, img_org, img_res): def get_image_and_scales_after_enhancing(self, img_org, img_res):
self.logger.debug("enter get_image_and_scales_after_enhancing") self.logger.debug("enter get_image_and_scales_after_enhancing")
@ -419,6 +413,18 @@ class Eynollah:
self.scale_y = img_res.shape[0] / float(self.image_org.shape[0]) self.scale_y = img_res.shape[0] / float(self.image_org.shape[0])
self.scale_x = img_res.shape[1] / float(self.image_org.shape[1]) self.scale_x = img_res.shape[1] / float(self.image_org.shape[1])
# Also set for the plotter
if self.plotter:
self.plotter.image_org = self.image_org
self.plotter.scale_y = self.scale_y
self.plotter.scale_x = self.scale_x
# Also set for the writer
self.writer.image_org = self.image_org
self.writer.scale_y = self.scale_y
self.writer.scale_x = self.scale_x
self.writer.height_org = self.height_org
self.writer.width_org = self.width_org
def start_new_session_and_model(self, model_dir): def start_new_session_and_model(self, model_dir):
self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir)
config = tf.ConfigProto() config = tf.ConfigProto()
@ -570,6 +576,7 @@ class Eynollah:
def extract_page(self): def extract_page(self):
self.logger.debug("enter extract_page") self.logger.debug("enter extract_page")
cont_page = []
model_page, session_page = self.start_new_session_and_model(self.model_page_dir) model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
img = cv2.GaussianBlur(self.image, (5, 5), 0) img = cv2.GaussianBlur(self.image, (5, 5), 0)
img_page_prediction = self.do_prediction(False, img, model_page) img_page_prediction = self.do_prediction(False, img, model_page)
@ -593,12 +600,12 @@ class Eynollah:
box = [x, y, w, h] box = [x, y, w, h]
croped_page, page_coord = crop_image_inside_box(box, self.image) croped_page, page_coord = crop_image_inside_box(box, self.image)
self.cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
session_page.close() session_page.close()
K.clear_session() K.clear_session()
self.logger.debug("exit extract_page") self.logger.debug("exit extract_page")
return croped_page, page_coord return croped_page, page_coord, cont_page
def extract_text_regions(self, img, patches, cols): def extract_text_regions(self, img, patches, cols):
self.logger.debug("enter extract_text_regions") self.logger.debug("enter extract_text_regions")
@ -1038,245 +1045,6 @@ class Eynollah:
poly.put(poly_sub) poly.put(poly_sub)
box_sub.put(boxes_sub_new) box_sub.put(boxes_sub_new)
def calculate_polygon_coords(self, contour_list, i, page_coord):
self.logger.debug('enter calculate_polygon_coords')
coords = ''
for j in range(len(contour_list[i])):
if len(contour_list[i][j]) == 2:
coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x))
coords += ','
coords += str(int((contour_list[i][j][1] + page_coord[0]) / self.scale_y))
else:
coords += str(int((contour_list[i][j][0][0] + page_coord[2]) / self.scale_x))
coords += ','
coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y))
if j < len(contour_list[i]) - 1:
coords=coords + ' '
#print(coords)
return coords
def calculate_page_coords(self):
self.logger.debug('enter calculate_page_coords')
points_page_print = ""
for _, contour in enumerate(self.cont_page[0]):
if len(contour) == 2:
points_page_print += str(int((contour[0]) / self.scale_x))
points_page_print += ','
points_page_print += str(int((contour[1]) / self.scale_y))
else:
points_page_print += str(int((contour[0][0]) / self.scale_x))
points_page_print += ','
points_page_print += str(int((contour[0][1] ) / self.scale_y))
points_page_print = points_page_print + ' '
return points_page_print[:-1]
def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l):
for j in range(len(all_found_texline_polygons_marginals[marginal_idx])):
textline = ET.SubElement(marginal, 'TextLine')
textline.set('id', 'l%s' % id_indexer_l)
id_indexer_l += 1
coord = ET.SubElement(textline, 'Coords')
add_textequiv(textline)
points_co = ''
for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])):
if not self.curved_line:
if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
else:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0])/self.scale_y))
else:
if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y))
else:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y))
if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1:
points_co += ' '
coord.set('points',points_co)
return id_indexer_l
def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l):
self.logger.debug('enter serialize_lines_in_region')
for j in range(len(all_found_texline_polygons[region_idx])):
textline = ET.SubElement(textregion, 'TextLine')
textline.set('id', 'l%s' % id_indexer_l)
id_indexer_l += 1
coord = ET.SubElement(textline, 'Coords')
add_textequiv(textline)
points_co = ''
for l in range(len(all_found_texline_polygons[region_idx][j])):
if not self.curved_line:
if len(all_found_texline_polygons[region_idx][j][l])==2:
textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x))
textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y))
else:
textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x))
textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y))
points_co += str(textline_x_coord)
points_co += ','
points_co += str(textline_y_coord)
if self.curved_line and np.abs(slopes[region_idx]) <= 45:
if len(all_found_texline_polygons[region_idx][j][l]) == 2:
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y))
else:
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y))
elif self.curved_line and np.abs(slopes[region_idx]) > 45:
if len(all_found_texline_polygons[region_idx][j][l])==2:
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y))
else:
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y))
if l < len(all_found_texline_polygons[region_idx][j]) - 1:
points_co += ' '
coord.set('points',points_co)
return id_indexer_l
def write_pagexml(self, pcgts):
self.logger.info("filename stem: '%s'", self.image_filename_stem)
tree = ET.ElementTree(pcgts)
tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml")
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes):
self.logger.debug('enter build_pagexml_no_full_layout')
# create the file structure
pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
page_print_sub = ET.SubElement(page, "Border")
coord_page = ET.SubElement(page_print_sub, "Coords")
coord_page.set('points', self.calculate_page_coords())
id_of_marginalia = []
id_indexer = 0
id_indexer_l = 0
if len(found_polygons_text_region) > 0:
xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
for mm in range(len(found_polygons_text_region)):
textregion = ET.SubElement(page, 'TextRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
textregion.set('type', 'paragraph')
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
add_textequiv(textregion)
for marginal_idx in range(len(found_polygons_marginals)):
marginal = ET.SubElement(page, 'TextRegion')
marginal.set('id', id_of_marginalia[mm])
marginal.set('type', 'marginalia')
coord_text = ET.SubElement(marginal, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l)
id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals)
for mm in range(len(found_polygons_text_region_img)):
textregion = ET.SubElement(page, 'ImageRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
coord_text = ET.SubElement(textregion, 'Coords')
points_co = ''
for lmm in range(len(found_polygons_text_region_img[mm])):
points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y))
if lmm < len(found_polygons_text_region_img[mm]) - 1:
points_co += ' '
coord_text.set('points', points_co)
return pcgts
def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes):
self.logger.debug('enter build_pagexml_full_layout')
# create the file structure
pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
page_print_sub = ET.SubElement(page, "Border")
coord_page = ET.SubElement(page_print_sub, "Coords")
coord_page.set('points', self.calculate_page_coords())
id_indexer = 0
id_indexer_l = 0
id_of_marginalia = []
if len(found_polygons_text_region) > 0:
xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
for mm in range(len(found_polygons_text_region)):
textregion=ET.SubElement(page, 'TextRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
textregion.set('type', 'paragraph')
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
add_textequiv(textregion)
self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h))
if len(found_polygons_text_region_h) > 0:
for mm in range(len(found_polygons_text_region_h)):
textregion=ET.SubElement(page, 'TextRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
textregion.set('type','header')
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord))
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l)
add_textequiv(textregion)
if len(found_polygons_drop_capitals) > 0:
id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals)
for mm in range(len(found_polygons_drop_capitals)):
textregion=ET.SubElement(page, 'TextRegion')
textregion.set('id',' r%s' % id_indexer)
id_indexer += 1
textregion.set('type', 'drop-capital')
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord))
add_textequiv(textregion)
for marginal_idx in range(len(found_polygons_marginals)):
marginal = ET.SubElement(page, 'TextRegion')
add_textequiv(textregion)
marginal.set('id', id_of_marginalia[mm])
marginal.set('type', 'marginalia')
coord_text = ET.SubElement(marginal, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l)
id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals)
for mm in range(len(found_polygons_text_region_img)):
textregion=ET.SubElement(page, 'ImageRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord))
for mm in range(len(found_polygons_tables)):
textregion = ET.SubElement(page, 'TableRegion')
textregion.set('id', 'r%s' %id_indexer)
id_indexer += 1
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord))
return pcgts
def get_regions_from_xy_2models(self,img,is_image_enhanced): def get_regions_from_xy_2models(self,img,is_image_enhanced):
self.logger.debug("enter get_regions_from_xy_2models") self.logger.debug("enter get_regions_from_xy_2models")
img_org = np.copy(img) img_org = np.copy(img)
@ -1621,7 +1389,7 @@ class Eynollah:
img_g3[:, :, 1] = img_g[:, :] img_g3[:, :, 1] = img_g[:, :]
img_g3[:, :, 2] = img_g[:, :] img_g3[:, :, 2] = img_g[:, :]
image_page, page_coord = self.extract_page() image_page, page_coord, cont_page = self.extract_page()
if self.plotter: if self.plotter:
self.plotter.save_page_image(image_page) self.plotter.save_page_image(image_page)
@ -1642,7 +1410,7 @@ class Eynollah:
except Exception as why: except Exception as why:
self.logger.error(why) self.logger.error(why)
num_col = None num_col = None
return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page
def run_enhancement(self): def run_enhancement(self):
self.logger.info("resize and enhance image") self.logger.info("resize and enhance image")
@ -1835,13 +1603,14 @@ class Eynollah:
self.logger.info("Textregion detection took %ss ", str(time.time() - t1)) self.logger.info("Textregion detection took %ss ", str(time.time() - t1))
t1 = time.time() t1 = time.time()
num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 = \ num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page = \
self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified) self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified)
self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) self.logger.info("Graphics detection took %ss ", str(time.time() - t1))
self.logger.info('cont_page %s', cont_page)
if not num_col: if not num_col:
self.logger.info("No columns detected, outputting an empty PAGE-XML") self.logger.info("No columns detected, outputting an empty PAGE-XML")
pcgts = self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], []) pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], cont_page)
self.logger.info("Job done in %ss", str(time.time() - t1)) self.logger.info("Job done in %ss", str(time.time() - t1))
return pcgts return pcgts
@ -2040,7 +1809,7 @@ class Eynollah:
else: else:
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d)
pcgts = self.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes) pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page)
self.logger.info("Job done in %ss", str(time.time() - t0)) self.logger.info("Job done in %ss", str(time.time() - t0))
return pcgts return pcgts
else: else:
@ -2050,7 +1819,6 @@ class Eynollah:
else: else:
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con])
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d)
pcgts = self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes) pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page)
self.logger.info("Job done in %ss", str(time.time() - t0)) self.logger.info("Job done in %ss", str(time.time() - t0))
return pcgts return pcgts

@ -0,0 +1,261 @@
# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member
from pathlib import Path
import os.path
from .utils.xml import create_page_xml, add_textequiv, xml_reading_order
from ocrd_utils import getLogger
from lxml import etree as ET
import numpy as np
class EynollahXmlWriter():
def __init__(self, *, dir_out, image_filename, curved_line):
self.logger = getLogger('eynollah.writer')
self.dir_out = dir_out
self.image_filename = image_filename
self.image_filename_stem = Path(Path(image_filename).name).stem
self.curved_line = curved_line
self.scale_x = None # XXX set outside __init__
self.scale_y = None # XXX set outside __init__
self.height_org = None # XXX set outside __init__
self.width_org = None # XXX set outside __init__
def calculate_page_coords(self, cont_page):
self.logger.debug('enter calculate_page_coords')
points_page_print = ""
for _, contour in enumerate(cont_page[0]):
if len(contour) == 2:
points_page_print += str(int((contour[0]) / self.scale_x))
points_page_print += ','
points_page_print += str(int((contour[1]) / self.scale_y))
else:
points_page_print += str(int((contour[0][0]) / self.scale_x))
points_page_print += ','
points_page_print += str(int((contour[0][1] ) / self.scale_y))
points_page_print = points_page_print + ' '
return points_page_print[:-1]
def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l):
for j in range(len(all_found_texline_polygons_marginals[marginal_idx])):
textline = ET.SubElement(marginal, 'TextLine')
textline.set('id', 'l%s' % id_indexer_l)
id_indexer_l += 1
coord = ET.SubElement(textline, 'Coords')
add_textequiv(textline)
points_co = ''
for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])):
if not self.curved_line:
if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
else:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0])/self.scale_y))
else:
if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y))
else:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y))
if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1:
points_co += ' '
coord.set('points',points_co)
return id_indexer_l
def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l):
self.logger.debug('enter serialize_lines_in_region')
for j in range(len(all_found_texline_polygons[region_idx])):
textline = ET.SubElement(textregion, 'TextLine')
textline.set('id', 'l%s' % id_indexer_l)
id_indexer_l += 1
coord = ET.SubElement(textline, 'Coords')
add_textequiv(textline)
points_co = ''
for l in range(len(all_found_texline_polygons[region_idx][j])):
if not self.curved_line:
if len(all_found_texline_polygons[region_idx][j][l])==2:
textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x))
textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y))
else:
textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x))
textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y))
points_co += str(textline_x_coord)
points_co += ','
points_co += str(textline_y_coord)
if self.curved_line and np.abs(slopes[region_idx]) <= 45:
if len(all_found_texline_polygons[region_idx][j][l]) == 2:
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y))
else:
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y))
elif self.curved_line and np.abs(slopes[region_idx]) > 45:
if len(all_found_texline_polygons[region_idx][j][l])==2:
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y))
else:
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x))
points_co += ','
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y))
if l < len(all_found_texline_polygons[region_idx][j]) - 1:
points_co += ' '
coord.set('points',points_co)
return id_indexer_l
def write_pagexml(self, pcgts):
self.logger.info("filename stem: '%s'", self.image_filename_stem)
tree = ET.ElementTree(pcgts)
tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml")
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page):
self.logger.debug('enter build_pagexml_no_full_layout')
# create the file structure
pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
page_print_sub = ET.SubElement(page, "Border")
coord_page = ET.SubElement(page_print_sub, "Coords")
coord_page.set('points', self.calculate_page_coords(cont_page))
id_of_marginalia = []
id_indexer = 0
id_indexer_l = 0
if len(found_polygons_text_region) > 0:
xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
for mm in range(len(found_polygons_text_region)):
textregion = ET.SubElement(page, 'TextRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
textregion.set('type', 'paragraph')
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
add_textequiv(textregion)
for marginal_idx in range(len(found_polygons_marginals)):
marginal = ET.SubElement(page, 'TextRegion')
marginal.set('id', id_of_marginalia[mm])
marginal.set('type', 'marginalia')
coord_text = ET.SubElement(marginal, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l)
id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals)
for mm in range(len(found_polygons_text_region_img)):
textregion = ET.SubElement(page, 'ImageRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
coord_text = ET.SubElement(textregion, 'Coords')
points_co = ''
for lmm in range(len(found_polygons_text_region_img[mm])):
points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x))
points_co += ','
points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y))
if lmm < len(found_polygons_text_region_img[mm]) - 1:
points_co += ' '
coord_text.set('points', points_co)
return pcgts
def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page):
self.logger.debug('enter build_pagexml_full_layout')
# create the file structure
pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
page_print_sub = ET.SubElement(page, "Border")
coord_page = ET.SubElement(page_print_sub, "Coords")
coord_page.set('points', self.calculate_page_coords(cont_page))
id_indexer = 0
id_indexer_l = 0
id_of_marginalia = []
if len(found_polygons_text_region) > 0:
xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
for mm in range(len(found_polygons_text_region)):
textregion=ET.SubElement(page, 'TextRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
textregion.set('type', 'paragraph')
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
add_textequiv(textregion)
self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h))
if len(found_polygons_text_region_h) > 0:
for mm in range(len(found_polygons_text_region_h)):
textregion=ET.SubElement(page, 'TextRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
textregion.set('type','header')
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord))
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l)
add_textequiv(textregion)
if len(found_polygons_drop_capitals) > 0:
id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals)
for mm in range(len(found_polygons_drop_capitals)):
textregion=ET.SubElement(page, 'TextRegion')
textregion.set('id',' r%s' % id_indexer)
id_indexer += 1
textregion.set('type', 'drop-capital')
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord))
add_textequiv(textregion)
for marginal_idx in range(len(found_polygons_marginals)):
marginal = ET.SubElement(page, 'TextRegion')
add_textequiv(textregion)
marginal.set('id', id_of_marginalia[mm])
marginal.set('type', 'marginalia')
coord_text = ET.SubElement(marginal, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l)
id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals)
for mm in range(len(found_polygons_text_region_img)):
textregion=ET.SubElement(page, 'ImageRegion')
textregion.set('id', 'r%s' % id_indexer)
id_indexer += 1
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord))
for mm in range(len(found_polygons_tables)):
textregion = ET.SubElement(page, 'TableRegion')
textregion.set('id', 'r%s' %id_indexer)
id_indexer += 1
coord_text = ET.SubElement(textregion, 'Coords')
coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord))
return pcgts
def calculate_polygon_coords(self, contour_list, i, page_coord):
self.logger.debug('enter calculate_polygon_coords')
coords = ''
for j in range(len(contour_list[i])):
if len(contour_list[i][j]) == 2:
coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x))
coords += ','
coords += str(int((contour_list[i][j][1] + page_coord[0]) / self.scale_y))
else:
coords += str(int((contour_list[i][j][0][0] + page_coord[2]) / self.scale_x))
coords += ','
coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y))
if j < len(contour_list[i]) - 1:
coords=coords + ' '
return coords
Loading…
Cancel
Save