commit 932c3fb479
@@ -0,0 +1 @@
__import__("pkg_resources").declare_namespace(__name__)
File diff suppressed because it is too large
@@ -0,0 +1,169 @@
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os.path
import cv2
from scipy.ndimage import gaussian_filter1d

from .utils import crop_image_inside_box
from .utils.rotate import rotyate_image_different
from .utils.resize import resize_image

class EynollahPlotter():
    """
    Class collecting all the plotting and image writing methods
    """

    def __init__(
        self,
        *,
        dir_of_all,
        dir_of_deskewed,
        dir_of_layout,
        dir_of_cropped_images,
        image_filename,
        image_filename_stem,
        image_org=None,
        scale_x=1,
        scale_y=1,
    ):
        self.dir_of_all = dir_of_all
        self.dir_of_layout = dir_of_layout
        self.dir_of_cropped_images = dir_of_cropped_images
        self.dir_of_deskewed = dir_of_deskewed
        self.image_filename = image_filename
        self.image_filename_stem = image_filename_stem
        # XXX TODO hacky: these cannot be set at init time
        self.image_org = image_org
        self.scale_x = scale_x
        self.scale_y = scale_y

    def save_plot_of_layout_main(self, text_regions_p, image_page):
        if self.dir_of_layout is not None:
            values = np.unique(text_regions_p[:, :])
            # pixels = ['Background', 'Main text', 'Heading', 'Marginalia', 'Drop capitals', 'Images', 'Separators', 'Tables', 'Graphics']
            pixels = ['Background', 'Main text', 'Image', 'Separator', 'Marginalia']
            values_indexes = [0, 1, 2, 3, 4]
            plt.figure(figsize=(40, 40))
            plt.rcParams["font.size"] = "40"
            im = plt.imshow(text_regions_p[:, :])
            colors = [im.cmap(im.norm(value)) for value in values]
            patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values]
            plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40)
            plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout_main.png"))


    def save_plot_of_layout_main_all(self, text_regions_p, image_page):
        if self.dir_of_all is not None:
            values = np.unique(text_regions_p[:, :])
            # pixels = ['Background', 'Main text', 'Heading', 'Marginalia', 'Drop capitals', 'Images', 'Separators', 'Tables', 'Graphics']
            pixels = ['Background', 'Main text', 'Image', 'Separator', 'Marginalia']
            values_indexes = [0, 1, 2, 3, 4]
            plt.figure(figsize=(80, 40))
            plt.rcParams["font.size"] = "40"
            plt.subplot(1, 2, 1)
            plt.imshow(image_page)
            plt.subplot(1, 2, 2)
            im = plt.imshow(text_regions_p[:, :])
            colors = [im.cmap(im.norm(value)) for value in values]
            patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values]
            plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60)
            plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_main_and_page.png"))

    def save_plot_of_layout(self, text_regions_p, image_page):
        if self.dir_of_layout is not None:
            values = np.unique(text_regions_p[:, :])
            # pixels = ['Background', 'Main text', 'Heading', 'Marginalia', 'Drop capitals', 'Images', 'Separators', 'Tables', 'Graphics']
            pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"]
            values_indexes = [0, 1, 2, 8, 4, 5, 6]
            plt.figure(figsize=(40, 40))
            plt.rcParams["font.size"] = "40"
            im = plt.imshow(text_regions_p[:, :])
            colors = [im.cmap(im.norm(value)) for value in values]
            patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values]
            plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40)
            plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout.png"))

    def save_plot_of_layout_all(self, text_regions_p, image_page):
        if self.dir_of_all is not None:
            values = np.unique(text_regions_p[:, :])
            # pixels = ['Background', 'Main text', 'Heading', 'Marginalia', 'Drop capitals', 'Images', 'Separators', 'Tables', 'Graphics']
            pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"]
            values_indexes = [0, 1, 2, 8, 4, 5, 6]
            plt.figure(figsize=(80, 40))
            plt.rcParams["font.size"] = "40"
            plt.subplot(1, 2, 1)
            plt.imshow(image_page)
            plt.subplot(1, 2, 2)
            im = plt.imshow(text_regions_p[:, :])
            colors = [im.cmap(im.norm(value)) for value in values]
            patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values]
            plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60)
            plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_and_page.png"))

    def save_plot_of_textlines(self, textline_mask_tot_ea, image_page):
        if self.dir_of_all is not None:
            values = np.unique(textline_mask_tot_ea[:, :])
            pixels = ["Background", "Textlines"]
            values_indexes = [0, 1]
            plt.figure(figsize=(80, 40))
            plt.rcParams["font.size"] = "40"
            plt.subplot(1, 2, 1)
            plt.imshow(image_page)
            plt.subplot(1, 2, 2)
            im = plt.imshow(textline_mask_tot_ea[:, :])
            colors = [im.cmap(im.norm(value)) for value in values]
            patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values]
            plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60)
            plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_textline_and_page.png"))

    def save_deskewed_image(self, slope_deskew):
        if self.dir_of_all is not None:
            cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_org.png"), self.image_org)
        if self.dir_of_deskewed is not None:
            img_rotated = rotyate_image_different(self.image_org, slope_deskew)
            cv2.imwrite(os.path.join(self.dir_of_deskewed, self.image_filename_stem + "_deskewed.png"), img_rotated)

    def save_page_image(self, image_page):
        if self.dir_of_all is not None:
            cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_page.png"), image_page)

    def save_plot_of_textline_density(self, img_patch_org):
        if self.dir_of_all is not None:
            plt.figure(figsize=(80, 40))
            plt.rcParams['font.size'] = '50'
            plt.subplot(1, 2, 1)
            plt.imshow(img_patch_org)
            plt.subplot(1, 2, 2)
            plt.plot(gaussian_filter1d(img_patch_org.sum(axis=1), 3), np.array(range(len(gaussian_filter1d(img_patch_org.sum(axis=1), 3)))), linewidth=8)
            plt.xlabel('Density of textline prediction in direction of X axis', fontsize=60)
            plt.ylabel('Height', fontsize=60)
            plt.yticks([0, len(gaussian_filter1d(img_patch_org.sum(axis=1), 3))])
            plt.gca().invert_yaxis()
            plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + '_density_of_textline.png'))

    def save_plot_of_rotation_angle(self, angels, var_res):
        if self.dir_of_all is not None:
            plt.figure(figsize=(60, 30))
            plt.rcParams['font.size'] = '50'
            plt.plot(angels, np.array(var_res), '-o', markersize=25, linewidth=4)
            plt.xlabel('angle', fontsize=50)
            plt.ylabel('variance of sum of rotated textline in direction of x axis', fontsize=50)
            plt.plot(angels[np.argmax(var_res)], var_res[np.argmax(np.array(var_res))], '*', markersize=50, label='Angle of deskewing=' + str("{:.2f}".format(angels[np.argmax(var_res)])) + r'$\degree$')
            plt.legend(loc='best')
            plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + '_rotation_angle.png'))

    def write_images_into_directory(self, img_contoures, image_page):
        if self.dir_of_cropped_images is not None:
            index = 0
            for cont_ind in img_contoures:
                x, y, w, h = cv2.boundingRect(cont_ind)
                box = [x, y, w, h]
                croped_page, page_coord = crop_image_inside_box(box, image_page)

                croped_page = resize_image(croped_page, int(croped_page.shape[0] / self.scale_y), int(croped_page.shape[1] / self.scale_x))

                path = os.path.join(self.dir_of_cropped_images, self.image_filename_stem + "_" + str(index) + ".jpg")
                cv2.imwrite(path, croped_page)
                index += 1
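
The plotter above is a pure output helper; the following is a minimal usage sketch, not part of the commit. The module path qurator.eynollah.plot, the output directories, and the file names are all assumptions made for illustration.

# Hypothetical wiring of EynollahPlotter; module path and file names are assumed.
import cv2
from qurator.eynollah.plot import EynollahPlotter  # assumed module name

image_org = cv2.imread("page_0001.tif")  # illustrative input page
plotter = EynollahPlotter(
    dir_of_all="out/debug",              # directories must already exist for cv2.imwrite/plt.savefig
    dir_of_deskewed="out/deskewed",
    dir_of_layout="out/layout",
    dir_of_cropped_images="out/crops",
    image_filename="page_0001.tif",
    image_filename_stem="page_0001",
    image_org=image_org,
    scale_x=1,
    scale_y=1,
)
# Every save_* method is a no-op when its target directory was passed as None.
plotter.save_page_image(image_org)
plotter.save_deskewed_image(slope_deskew=0.0)
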
@@ -0,0 +1,24 @@
from PIL import Image
import numpy as np
from ocrd_models import OcrdExif
from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor, imread

# from sbb_binarization

def cv2pil(img):
    return Image.fromarray(img.astype('uint8'))

def pil2cv(img):
    # from ocrd/workspace.py
    color_conversion = COLOR_GRAY2BGR if img.mode in ('1', 'L') else COLOR_RGB2BGR
    pil_as_np_array = np.array(img).astype('uint8') if img.mode == '1' else np.array(img)
    return cvtColor(pil_as_np_array, color_conversion)

def check_dpi(image_filename):
    exif = OcrdExif(Image.open(image_filename))
    print(exif.to_xml())
    resolution = exif.resolution
    if exif.resolutionUnit == 'cm':
        resolution /= 2.54
    return int(resolution)
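
A short sketch of how these helpers fit together, not part of the commit. The import path qurator.eynollah.utils.pil_cv2 is taken from the test added further down; the array shape and the file name are illustrative assumptions.

# Hypothetical round trip between OpenCV arrays and PIL images.
import numpy as np
from qurator.eynollah.utils.pil_cv2 import cv2pil, pil2cv, check_dpi  # path taken from tests/test_dpi.py below

bgr = np.zeros((100, 200, 3), dtype=np.uint8)  # a black 200x100 BGR image
pil_img = cv2pil(bgr)          # wraps the array as a PIL image; no channel reordering happens here
bgr_again = pil2cv(pil_img)    # converts back, treating the PIL image as RGB (or grayscale)
assert bgr_again.shape == bgr.shape

# check_dpi reads the EXIF resolution of an image file (adjusting when the unit is cm):
# check_dpi("kant_aufklaerung_1784_0020.tif")  # the test below expects 300 for this resource
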
@@ -0,0 +1,62 @@
from lxml import etree as ET

NAMESPACES = {}
NAMESPACES['page'] = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
NAMESPACES['xsi'] = "http://www.w3.org/2001/XMLSchema-instance"
NAMESPACES[None] = NAMESPACES['page']

def create_page_xml(imageFilename, height, width):
    pcgts = ET.Element("PcGts", nsmap=NAMESPACES)

    pcgts.set("{%s}schemaLocation" % NAMESPACES['xsi'], NAMESPACES['page'])

    metadata = ET.SubElement(pcgts, "Metadata")

    author = ET.SubElement(metadata, "Creator")
    author.text = "SBB_QURATOR"

    created = ET.SubElement(metadata, "Created")
    created.text = "2019-06-17T18:15:12"

    changetime = ET.SubElement(metadata, "LastChange")
    changetime.text = "2019-06-17T18:15:12"

    page = ET.SubElement(pcgts, "Page")

    page.set("imageFilename", imageFilename)
    page.set("imageHeight", str(height))
    page.set("imageWidth", str(width))
    page.set("type", "content")
    page.set("readingDirection", "left-to-right")
    page.set("textLineOrder", "top-to-bottom")

    return pcgts, page

def add_textequiv(parent, text=''):
    textequiv = ET.SubElement(parent, 'TextEquiv')
    unireg = ET.SubElement(textequiv, 'Unicode')
    unireg.text = text

def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals):
    """
    XXX side-effect: extends id_of_marginalia
    """
    region_order = ET.SubElement(page, 'ReadingOrder')
    region_order_sub = ET.SubElement(region_order, 'OrderedGroup')
    region_order_sub.set('id', "ro357564684568544579089")
    indexer_region = 0
    for vj in order_of_texts:
        name = "coord_text_%s" % vj
        name = ET.SubElement(region_order_sub, 'RegionRefIndexed')
        name.set('index', str(indexer_region))
        name.set('regionRef', id_of_texts[vj])
        indexer_region += 1
    for vm in range(len(found_polygons_marginals)):
        id_of_marginalia.append('r%s' % indexer_region)
        name = "coord_text_%s" % indexer_region
        name = ET.SubElement(region_order_sub, 'RegionRefIndexed')
        name.set('index', str(indexer_region))
        name.set('regionRef', 'r%s' % indexer_region)
        indexer_region += 1
    return id_of_marginalia
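
A hedged sketch of the intended call sequence for these helpers, not part of the commit. The module path qurator.eynollah.utils.xml is inferred from the relative import in the writer below; the file name, page size, and region id are made up for illustration.

# Hypothetical minimal PcGts document built with the helpers above.
from lxml import etree as ET
from qurator.eynollah.utils.xml import create_page_xml, add_textequiv, xml_reading_order  # path inferred

pcgts, page = create_page_xml("page_0001.tif", 2000, 1500)  # illustrative file name and page size

region = ET.SubElement(page, "TextRegion")
region.set("id", "r0")
add_textequiv(region, "example text")

# One region in reading order, no marginalia; note the documented side effect on id_of_marginalia.
marginalia_ids = xml_reading_order(page, order_of_texts=[0], id_of_texts=["r0"],
                                    id_of_marginalia=[], found_polygons_marginals=[])

print(ET.tostring(pcgts, pretty_print=True).decode())
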
@@ -0,0 +1,272 @@
# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member
from pathlib import Path
import os.path

from .utils.xml import create_page_xml, add_textequiv, xml_reading_order

from ocrd_utils import getLogger
from lxml import etree as ET
import numpy as np

class EynollahXmlWriter():

    def __init__(self, *, dir_out, image_filename, curved_line):
        self.logger = getLogger('eynollah.writer')
        self.dir_out = dir_out
        self.image_filename = image_filename
        self.image_filename_stem = Path(Path(image_filename).name).stem
        self.curved_line = curved_line
        self.scale_x = None  # XXX set outside __init__
        self.scale_y = None  # XXX set outside __init__
        self.height_org = None  # XXX set outside __init__
        self.width_org = None  # XXX set outside __init__

    def calculate_page_coords(self, cont_page):
        self.logger.debug('enter calculate_page_coords')
        points_page_print = ""
        for _, contour in enumerate(cont_page[0]):
            if len(contour) == 2:
                points_page_print += str(int((contour[0]) / self.scale_x))
                points_page_print += ','
                points_page_print += str(int((contour[1]) / self.scale_y))
            else:
                points_page_print += str(int((contour[0][0]) / self.scale_x))
                points_page_print += ','
                points_page_print += str(int((contour[0][1]) / self.scale_y))
            points_page_print = points_page_print + ' '
        return points_page_print[:-1]

    def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l):
        for j in range(len(all_found_texline_polygons_marginals[marginal_idx])):
            textline = ET.SubElement(marginal, 'TextLine')
            textline.set('id', 'l%s' % id_indexer_l)
            id_indexer_l += 1
            coord = ET.SubElement(textline, 'Coords')
            add_textequiv(textline)
            points_co = ''
            for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])):
                if not self.curved_line:
                    if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2:
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
                    else:
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
                if self.curved_line and np.abs(slopes_marginals[marginal_idx]) <= 45:
                    if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2:
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y))
                    else:
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y))

                elif self.curved_line and np.abs(slopes_marginals[marginal_idx]) > 45:
                    if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2:
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
                    else:
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))

                if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1:
                    points_co += ' '
            coord.set('points', points_co)
        return id_indexer_l

    def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l):
        self.logger.debug('enter serialize_lines_in_region')
        for j in range(len(all_found_texline_polygons[region_idx])):
            textline = ET.SubElement(textregion, 'TextLine')
            textline.set('id', 'l%s' % id_indexer_l)
            id_indexer_l += 1
            coord = ET.SubElement(textline, 'Coords')
            add_textequiv(textline)

            points_co = ''
            for l in range(len(all_found_texline_polygons[region_idx][j])):
                if not self.curved_line:
                    if len(all_found_texline_polygons[region_idx][j][l]) == 2:
                        textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x))
                        textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y))
                    else:
                        textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x))
                        textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y))
                    points_co += str(textline_x_coord)
                    points_co += ','
                    points_co += str(textline_y_coord)

                if self.curved_line and np.abs(slopes[region_idx]) <= 45:
                    if len(all_found_texline_polygons[region_idx][j][l]) == 2:
                        points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y))
                    else:
                        points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0]) / self.scale_y))
                elif self.curved_line and np.abs(slopes[region_idx]) > 45:
                    if len(all_found_texline_polygons[region_idx][j][l]) == 2:
                        points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y))
                    else:
                        points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x))
                        points_co += ','
                        points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y))

                if l < len(all_found_texline_polygons[region_idx][j]) - 1:
                    points_co += ' '
            coord.set('points', points_co)
        return id_indexer_l

    def write_pagexml(self, pcgts):
        self.logger.info("filename stem: '%s'", self.image_filename_stem)
        tree = ET.ElementTree(pcgts)
        tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml")

    def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page):
        self.logger.debug('enter build_pagexml_no_full_layout')

        # create the file structure
        pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
        page_print_sub = ET.SubElement(page, "Border")
        coord_page = ET.SubElement(page_print_sub, "Coords")
        coord_page.set('points', self.calculate_page_coords(cont_page))

        id_of_marginalia = []
        id_indexer = 0
        id_indexer_l = 0
        if len(found_polygons_text_region) > 0:
            id_of_marginalia = xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
            for mm in range(len(found_polygons_text_region)):
                textregion = ET.SubElement(page, 'TextRegion')
                textregion.set('id', 'r%s' % id_indexer)
                id_indexer += 1
                textregion.set('type', 'paragraph')
                coord_text = ET.SubElement(textregion, 'Coords')
                coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
                id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
                add_textequiv(textregion)

        for mm in range(len(found_polygons_marginals)):
            marginal = ET.SubElement(page, 'TextRegion')
            marginal.set('id', id_of_marginalia[mm])
            marginal.set('type', 'marginalia')
            coord_text = ET.SubElement(marginal, 'Coords')
            coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
            id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l)

        id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals)
        for mm in range(len(found_polygons_text_region_img)):
            textregion = ET.SubElement(page, 'ImageRegion')
            textregion.set('id', 'r%s' % id_indexer)
            id_indexer += 1
            coord_text = ET.SubElement(textregion, 'Coords')
            points_co = ''
            for lmm in range(len(found_polygons_text_region_img[mm])):
                points_co += str(int((found_polygons_text_region_img[mm][lmm, 0, 0] + page_coord[2]) / self.scale_x))
                points_co += ','
                points_co += str(int((found_polygons_text_region_img[mm][lmm, 0, 1] + page_coord[0]) / self.scale_y))
                if lmm < len(found_polygons_text_region_img[mm]) - 1:
                    points_co += ' '
            coord_text.set('points', points_co)

        return pcgts

    def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page):
        self.logger.debug('enter build_pagexml_full_layout')

        # create the file structure
        pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
        page_print_sub = ET.SubElement(page, "Border")
        coord_page = ET.SubElement(page_print_sub, "Coords")
        coord_page.set('points', self.calculate_page_coords(cont_page))

        id_indexer = 0
        id_indexer_l = 0
        id_of_marginalia = []

        if len(found_polygons_text_region) > 0:
            id_of_marginalia = xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
            for mm in range(len(found_polygons_text_region)):
                textregion = ET.SubElement(page, 'TextRegion')
                textregion.set('id', 'r%s' % id_indexer)
                id_indexer += 1
                textregion.set('type', 'paragraph')
                coord_text = ET.SubElement(textregion, 'Coords')
                coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
                id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
                add_textequiv(textregion)

        self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h))
        if len(found_polygons_text_region_h) > 0:
            for mm in range(len(found_polygons_text_region_h)):
                textregion = ET.SubElement(page, 'TextRegion')
                textregion.set('id', 'r%s' % id_indexer)
                id_indexer += 1
                textregion.set('type', 'header')
                coord_text = ET.SubElement(textregion, 'Coords')
                coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord))
                id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l)
                add_textequiv(textregion)

        if len(found_polygons_drop_capitals) > 0:
            id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals)
            for mm in range(len(found_polygons_drop_capitals)):
                textregion = ET.SubElement(page, 'TextRegion')
                textregion.set('id', 'r%s' % id_indexer)
                id_indexer += 1
                textregion.set('type', 'drop-capital')
                coord_text = ET.SubElement(textregion, 'Coords')
                coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord))
                add_textequiv(textregion)

        for mm in range(len(found_polygons_marginals)):
            marginal = ET.SubElement(page, 'TextRegion')
            add_textequiv(marginal)
            marginal.set('id', id_of_marginalia[mm])
            marginal.set('type', 'marginalia')
            coord_text = ET.SubElement(marginal, 'Coords')
            coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
            id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l)

        id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals)
        for mm in range(len(found_polygons_text_region_img)):
            textregion = ET.SubElement(page, 'ImageRegion')
            textregion.set('id', 'r%s' % id_indexer)
            id_indexer += 1
            coord_text = ET.SubElement(textregion, 'Coords')
            coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord))

        for mm in range(len(found_polygons_tables)):
            textregion = ET.SubElement(page, 'TableRegion')
            textregion.set('id', 'r%s' % id_indexer)
            id_indexer += 1
            coord_text = ET.SubElement(textregion, 'Coords')
            coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord))

        return pcgts

    def calculate_polygon_coords(self, contour_list, i, page_coord):
        self.logger.debug('enter calculate_polygon_coords')
        coords = ''
        for j in range(len(contour_list[i])):
            if len(contour_list[i][j]) == 2:
                coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x))
                coords += ','
                coords += str(int((contour_list[i][j][1] + page_coord[0]) / self.scale_y))
            else:
                coords += str(int((contour_list[i][j][0][0] + page_coord[2]) / self.scale_x))
                coords += ','
                coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y))

            if j < len(contour_list[i]) - 1:
                coords = coords + ' '
        return coords
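
A sketch of how the writer above is meant to be driven, based only on the attributes and methods shown in this diff, and not part of the commit. The import path and all values are assumptions; the commented call at the end stands in for the long argument list of build_pagexml_no_full_layout.

# Hypothetical use of EynollahXmlWriter; module path and values are assumed.
from qurator.eynollah.writer import EynollahXmlWriter  # assumed module name

writer = EynollahXmlWriter(dir_out="out", image_filename="page_0001.tif", curved_line=False)
# The attributes flagged "XXX set outside __init__" above must be filled in by the caller
# before any of the build_*/write_* methods are used.
writer.height_org = 2000   # original page height in pixels
writer.width_org = 1500    # original page width in pixels
writer.scale_x = 1.0       # ratio between processed and original width
writer.scale_y = 1.0       # ratio between processed and original height

# cont_page is expected to look like an OpenCV contour wrapped in a list, e.g.
# cont_page = [np.array([[[0, 0]], [[1499, 0]], [[1499, 1999]], [[0, 1999]]])]
# pcgts = writer.build_pagexml_no_full_layout(..., cont_page=cont_page)  # long argument list elided
# writer.write_pagexml(pcgts)  # writes <dir_out>/<image stem>.xml
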
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,34 +0,0 @@
from lxml import etree as ET

NAMESPACES = {}
NAMESPACES['page'] = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
NAMESPACES['xsi'] = "http://www.w3.org/2001/XMLSchema-instance"
NAMESPACES[None] = NAMESPACES['page']

def create_page_xml(imageFilename, height, width):
    pcgts = ET.Element("PcGts", nsmap=NAMESPACES)

    pcgts.set("{%s}schemaLocation" % NAMESPACES['xsi'], NAMESPACES['page'])

    metadata = ET.SubElement(pcgts, "Metadata")

    author = ET.SubElement(metadata, "Creator")
    author.text = "SBB_QURATOR"

    created = ET.SubElement(metadata, "Created")
    created.text = "2019-06-17T18:15:12"

    changetime = ET.SubElement(metadata, "LastChange")
    changetime.text = "2019-06-17T18:15:12"

    page = ET.SubElement(pcgts, "Page")

    page.set("imageFilename", imageFilename)
    page.set("imageHeight", str(height))
    page.set("imageWidth", str(width))
    page.set("type", "content")
    page.set("readingDirection", "left-to-right")
    page.set("textLineOrder", "top-to-bottom")

    return pcgts, page
@@ -0,0 +1,54 @@
# pylint: disable=unused-import

from os.path import dirname, realpath
from os import chdir
import sys
import logging
import io
import collections
from unittest import TestCase as VanillaTestCase, skip, main as unittests_main
import pytest
from ocrd_utils import disableLogging, initLogging

def main(fn=None):
    if fn:
        sys.exit(pytest.main([fn]))
    else:
        unittests_main()

class TestCase(VanillaTestCase):

    @classmethod
    def setUpClass(cls):
        chdir(dirname(realpath(__file__)) + '/..')

    def setUp(self):
        disableLogging()
        initLogging()

class CapturingTestCase(TestCase):
    """
    A TestCase that needs to capture stderr/stdout and invoke click CLI.
    """

    @pytest.fixture(autouse=True)
    def _setup_pytest_capfd(self, capfd):
        self.capfd = capfd

    def invoke_cli(self, cli, args):
        """
        Substitution for click.CliRunner.invoke that works nicely together
        with unittest/pytest capturing of stdout/stderr.
        """
        self.capture_out_err()  # XXX snapshot just before executing the CLI
        code = 0
        sys.argv[1:] = args  # XXX necessary because sys.argv reflects pytest args, not cli args
        try:
            cli.main(args=args)
        except SystemExit as e:
            code = e.code
        out, err = self.capture_out_err()
        return code, out, err

    def capture_out_err(self):
        return self.capfd.readouterr()
@@ -0,0 +1,10 @@
from pathlib import Path
from qurator.eynollah.utils.pil_cv2 import check_dpi
from tests.base import main

def test_dpi():
    fpath = Path(__file__).parent.joinpath('resources', 'kant_aufklaerung_1784_0020.tif')
    assert 300 == check_dpi(str(fpath))

if __name__ == '__main__':
    main(__file__)
@@ -0,0 +1,24 @@
from os import environ
from pathlib import Path
from ocrd_utils import pushd_popd
from tests.base import CapturingTestCase as TestCase, main
from qurator.eynollah.cli import main as eynollah_cli

testdir = Path(__file__).parent.resolve()

EYNOLLAH_MODELS = environ.get('EYNOLLAH_MODELS', str(testdir.joinpath('..', 'models_eynollah').resolve()))

class TestEynollahRun(TestCase):

    def test_full_run(self):
        with pushd_popd(tempdir=True) as tempdir:
            code, out, err = self.invoke_cli(eynollah_cli, [
                '-m', EYNOLLAH_MODELS,
                '-i', str(testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')),
                '-o', tempdir
            ])
            print(code, out, err)
            assert not code

if __name__ == '__main__':
    main(__file__)
@@ -1,7 +1,7 @@
 def test_utils_import():
-    import sbb_newspapers_org_image.utils
-    import sbb_newspapers_org_image.utils.contour
-    import sbb_newspapers_org_image.utils.drop_capitals
-    import sbb_newspapers_org_image.utils.drop_capitals
-    import sbb_newspapers_org_image.utils.is_nan
-    import sbb_newspapers_org_image.utils.rotate
+    import qurator.eynollah.utils
+    import qurator.eynollah.utils.contour
+    import qurator.eynollah.utils.drop_capitals
+    import qurator.eynollah.utils.drop_capitals
+    import qurator.eynollah.utils.is_nan
+    import qurator.eynollah.utils.rotate