mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-12-01 08:44:13 +01:00
🔥 refactor eynollah ocr
.
This commit is contained in:
parent
30f9c695dc
commit
b161e33854
5 changed files with 769 additions and 865 deletions
|
|
@ -88,7 +88,6 @@ def ocr_cli(
|
|||
tr_ocr,
|
||||
do_not_mask_with_textline_contour,
|
||||
batch_size,
|
||||
dataset_abbrevation,
|
||||
min_conf_value_of_textline_text,
|
||||
):
|
||||
"""
|
||||
|
|
@ -101,7 +100,6 @@ def ocr_cli(
|
|||
tr_ocr=tr_ocr,
|
||||
do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
|
||||
batch_size=batch_size,
|
||||
pref_of_dataset=dataset_abbrevation,
|
||||
min_conf_value_of_textline_text=min_conf_value_of_textline_text)
|
||||
eynollah_ocr.run(overwrite=overwrite,
|
||||
dir_in=dir_in,
|
||||
|
|
|
|||
|
|
@ -1,24 +1,22 @@
|
|||
# FIXME: fix all of those...
|
||||
# pyright: reportPossiblyUnboundVariable=false
|
||||
# pyright: reportOptionalMemberAccess=false
|
||||
# pyright: reportArgumentType=false
|
||||
# pyright: reportCallIssue=false
|
||||
# pyright: reportOptionalSubscript=false
|
||||
|
||||
from logging import Logger, getLogger
|
||||
from typing import Optional
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
import os
|
||||
import gc
|
||||
import sys
|
||||
import math
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
import cv2
|
||||
import xml.etree.ElementTree as ET
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from cv2.typing import MatLike
|
||||
from xml.etree import ElementTree as ET
|
||||
from PIL import Image, ImageDraw
|
||||
import numpy as np
|
||||
from eynollah.model_zoo import EynollahModelZoo
|
||||
from eynollah.utils.font import get_font
|
||||
from eynollah.utils.xml import etree_namespace_for_element_tag
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
|
|
@ -38,11 +36,13 @@ from .utils.utils_ocr import (
|
|||
rotate_image_with_padding,
|
||||
)
|
||||
|
||||
# importlib.resources.files is needed below (added to the stdlib in Python 3.9); the guard switches at 3.10, older interpreters use the importlib_resources backport
|
||||
if sys.version_info < (3, 10):
|
||||
import importlib_resources
|
||||
else:
|
||||
import importlib.resources as importlib_resources
|
||||
# TODO: refine typing
|
||||
@dataclass
class EynollahOcrResult:
    """
    Per-page OCR result, as returned by the OCR back-ends and consumed
    when the recognised text is written back into the PAGE-XML tree.
    """
    # Recognised text, one entry per text line (``None`` entries are
    # filtered out before the result is built).
    extracted_texts_merged: List[str]
    # Confidence value per text line; ``None`` when the back-end does not
    # provide confidences (the TrOCR path passes ``None``).
    extracted_conf_value_merged: Optional[List[float]]
    # Region index of every cropped line; used to group the line texts by
    # text region.  NOTE(review): presumably integer indices - confirm.
    cropped_lines_region_indexer: List
    # Bounding boxes of the text lines as ``[x, y, w, h]``.
    total_bb_coordinates: List[List[int]]
|
||||
|
||||
class Eynollah_ocr:
|
||||
def __init__(
|
||||
|
|
@ -76,6 +76,7 @@ class Eynollah_ocr:
|
|||
|
||||
@property
|
||||
def device(self):
|
||||
assert torch
|
||||
if torch.cuda.is_available():
|
||||
self.logger.info("Using GPU acceleration")
|
||||
return torch.device("cuda:0")
|
||||
|
|
@ -83,59 +84,17 @@ class Eynollah_ocr:
|
|||
self.logger.info("Using CPU processing")
|
||||
return torch.device("cpu")
|
||||
|
||||
def run(self, overwrite: bool = False,
|
||||
dir_in: Optional[str] = None,
|
||||
# Prediction with RGB and binarized images for selected pages, should not be the default
|
||||
dir_in_bin: Optional[str] = None,
|
||||
image_filename: Optional[str] = None,
|
||||
dir_xmls: Optional[str] = None,
|
||||
dir_out_image_text: Optional[str] = None,
|
||||
dir_out: Optional[str] = None,
|
||||
):
|
||||
if dir_in:
|
||||
ls_imgs = [os.path.join(dir_in, image_filename)
|
||||
for image_filename in filter(is_image_filename,
|
||||
os.listdir(dir_in))]
|
||||
else:
|
||||
assert image_filename
|
||||
ls_imgs = [image_filename]
|
||||
def run_trocr(
|
||||
self,
|
||||
*,
|
||||
img: MatLike,
|
||||
page_tree: ET.ElementTree,
|
||||
page_ns,
|
||||
tr_ocr_input_height_and_width,
|
||||
) -> EynollahOcrResult:
|
||||
|
||||
if self.tr_ocr:
|
||||
tr_ocr_input_height_and_width = 384
|
||||
for dir_img in ls_imgs:
|
||||
file_name = Path(dir_img).stem
|
||||
assert dir_xmls # FIXME: check the logic
|
||||
dir_xml = os.path.join(dir_xmls, file_name+'.xml')
|
||||
assert dir_out # FIXME: check the logic
|
||||
out_file_ocr = os.path.join(dir_out, file_name+'.xml')
|
||||
|
||||
if os.path.exists(out_file_ocr):
|
||||
if overwrite:
|
||||
self.logger.warning("will overwrite existing output file '%s'", out_file_ocr)
|
||||
else:
|
||||
self.logger.warning("will skip input for existing output file '%s'", out_file_ocr)
|
||||
continue
|
||||
|
||||
img = cv2.imread(dir_img)
|
||||
|
||||
if dir_out_image_text:
|
||||
out_image_with_text = os.path.join(dir_out_image_text, file_name+'.png')
|
||||
image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
|
||||
draw = ImageDraw.Draw(image_text)
|
||||
total_bb_coordinates = []
|
||||
|
||||
##file_name = Path(dir_xmls).stem
|
||||
tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8"))
|
||||
root1=tree1.getroot()
|
||||
alltags=[elem.tag for elem in root1.iter()]
|
||||
link=alltags[0].split('}')[0]+'}'
|
||||
|
||||
name_space = alltags[0].split('}')[0]
|
||||
name_space = name_space.split('{')[1]
|
||||
|
||||
region_tags=np.unique([x for x in alltags if x.endswith('TextRegion')])
|
||||
|
||||
|
||||
|
||||
cropped_lines = []
|
||||
cropped_lines_region_indexer = []
|
||||
|
|
@ -146,7 +105,7 @@ class Eynollah_ocr:
|
|||
indexer_text_region = 0
|
||||
indexer_b_s = 0
|
||||
|
||||
for nn in root1.iter(region_tags):
|
||||
for nn in page_tree.getroot().iter(f'{{{page_ns}}}TextRegion'):
|
||||
for child_textregion in nn:
|
||||
if child_textregion.tag.endswith("TextLine"):
|
||||
|
||||
|
|
@ -159,7 +118,6 @@ class Eynollah_ocr:
|
|||
for x in p_h] )
|
||||
x,y,w,h = cv2.boundingRect(textline_coords)
|
||||
|
||||
if dir_out_image_text:
|
||||
total_bb_coordinates.append([x,y,w,h])
|
||||
|
||||
h2w_ratio = h/float(w)
|
||||
|
|
@ -301,185 +259,37 @@ class Eynollah_ocr:
|
|||
extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
|
||||
#print(extracted_texts_merged, len(extracted_texts_merged))
|
||||
|
||||
unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
|
||||
return EynollahOcrResult(
|
||||
extracted_texts_merged=extracted_texts_merged,
|
||||
extracted_conf_value_merged=None,
|
||||
cropped_lines_region_indexer=cropped_lines_region_indexer,
|
||||
total_bb_coordinates=total_bb_coordinates,
|
||||
)
|
||||
|
||||
if dir_out_image_text:
|
||||
def run_cnn(
|
||||
self,
|
||||
*,
|
||||
img: MatLike,
|
||||
img_bin: Optional[MatLike],
|
||||
page_tree: ET.ElementTree,
|
||||
page_ns,
|
||||
image_width,
|
||||
image_height,
|
||||
) -> EynollahOcrResult:
|
||||
|
||||
#font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists!
|
||||
font = importlib_resources.files(__package__) / "Charis-Regular.ttf"
|
||||
with importlib_resources.as_file(font) as font:
|
||||
font = ImageFont.truetype(font=font, size=40)
|
||||
|
||||
for indexer_text, bb_ind in enumerate(total_bb_coordinates):
|
||||
|
||||
|
||||
x_bb = bb_ind[0]
|
||||
y_bb = bb_ind[1]
|
||||
w_bb = bb_ind[2]
|
||||
h_bb = bb_ind[3]
|
||||
|
||||
font = fit_text_single_line(draw, extracted_texts_merged[indexer_text],
|
||||
font.path, w_bb, int(h_bb*0.4) )
|
||||
|
||||
##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
|
||||
|
||||
text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
|
||||
text_width = text_bbox[2] - text_bbox[0]
|
||||
text_height = text_bbox[3] - text_bbox[1]
|
||||
|
||||
text_x = x_bb + (w_bb - text_width) // 2 # Center horizontally
|
||||
text_y = y_bb + (h_bb - text_height) // 2 # Center vertically
|
||||
|
||||
# Draw the text
|
||||
draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
|
||||
image_text.save(out_image_with_text)
|
||||
|
||||
#print(len(unique_cropped_lines_region_indexer), 'unique_cropped_lines_region_indexer')
|
||||
#######text_by_textregion = []
|
||||
#######for ind in unique_cropped_lines_region_indexer:
|
||||
#######ind = np.array(cropped_lines_region_indexer)==ind
|
||||
#######extracted_texts_merged_un = np.array(extracted_texts_merged)[ind]
|
||||
#######text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
||||
|
||||
text_by_textregion = []
|
||||
for ind in unique_cropped_lines_region_indexer:
|
||||
ind = np.array(cropped_lines_region_indexer) == ind
|
||||
extracted_texts_merged_un = np.array(extracted_texts_merged)[ind]
|
||||
if len(extracted_texts_merged_un)>1:
|
||||
text_by_textregion_ind = ""
|
||||
next_glue = ""
|
||||
for indt in range(len(extracted_texts_merged_un)):
|
||||
if (extracted_texts_merged_un[indt].endswith('⸗') or
|
||||
extracted_texts_merged_un[indt].endswith('-') or
|
||||
extracted_texts_merged_un[indt].endswith('¬')):
|
||||
text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt][:-1]
|
||||
next_glue = ""
|
||||
else:
|
||||
text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt]
|
||||
next_glue = " "
|
||||
text_by_textregion.append(text_by_textregion_ind)
|
||||
else:
|
||||
text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
||||
|
||||
|
||||
indexer = 0
|
||||
indexer_textregion = 0
|
||||
for nn in root1.iter(region_tags):
|
||||
#id_textregion = nn.attrib['id']
|
||||
#id_textregions.append(id_textregion)
|
||||
#textregions_by_existing_ids.append(text_by_textregion[indexer_textregion])
|
||||
|
||||
is_textregion_text = False
|
||||
for childtest in nn:
|
||||
if childtest.tag.endswith("TextEquiv"):
|
||||
is_textregion_text = True
|
||||
|
||||
if not is_textregion_text:
|
||||
text_subelement_textregion = ET.SubElement(nn, 'TextEquiv')
|
||||
unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode')
|
||||
|
||||
|
||||
has_textline = False
|
||||
for child_textregion in nn:
|
||||
if child_textregion.tag.endswith("TextLine"):
|
||||
|
||||
is_textline_text = False
|
||||
for childtest2 in child_textregion:
|
||||
if childtest2.tag.endswith("TextEquiv"):
|
||||
is_textline_text = True
|
||||
|
||||
|
||||
if not is_textline_text:
|
||||
text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
|
||||
##text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
|
||||
unicode_textline = ET.SubElement(text_subelement, 'Unicode')
|
||||
unicode_textline.text = extracted_texts_merged[indexer]
|
||||
else:
|
||||
for childtest3 in child_textregion:
|
||||
if childtest3.tag.endswith("TextEquiv"):
|
||||
for child_uc in childtest3:
|
||||
if child_uc.tag.endswith("Unicode"):
|
||||
##childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
|
||||
child_uc.text = extracted_texts_merged[indexer]
|
||||
|
||||
indexer = indexer + 1
|
||||
has_textline = True
|
||||
if has_textline:
|
||||
if is_textregion_text:
|
||||
for child4 in nn:
|
||||
if child4.tag.endswith("TextEquiv"):
|
||||
for childtr_uc in child4:
|
||||
if childtr_uc.tag.endswith("Unicode"):
|
||||
childtr_uc.text = text_by_textregion[indexer_textregion]
|
||||
else:
|
||||
unicode_textregion.text = text_by_textregion[indexer_textregion]
|
||||
indexer_textregion = indexer_textregion + 1
|
||||
|
||||
###sample_order = [(id_to_order[tid], text)
|
||||
### for tid, text in zip(id_textregions, textregions_by_existing_ids)
|
||||
### if tid in id_to_order]
|
||||
|
||||
##ordered_texts_sample = [text for _, text in sorted(sample_order)]
|
||||
##tot_page_text = ' '.join(ordered_texts_sample)
|
||||
|
||||
##for page_element in root1.iter(link+'Page'):
|
||||
##text_page = ET.SubElement(page_element, 'TextEquiv')
|
||||
##unicode_textpage = ET.SubElement(text_page, 'Unicode')
|
||||
##unicode_textpage.text = tot_page_text
|
||||
|
||||
ET.register_namespace("",name_space)
|
||||
tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf-8",default_namespace=None)
|
||||
else:
|
||||
###max_len = 280#512#280#512
|
||||
###padding_token = 1500#299#1500#299
|
||||
image_width = 512#max_len * 4
|
||||
image_height = 32
|
||||
|
||||
|
||||
img_size=(image_width, image_height)
|
||||
|
||||
for dir_img in ls_imgs:
|
||||
file_name = Path(dir_img).stem
|
||||
dir_xml = os.path.join(dir_xmls, file_name+'.xml')
|
||||
out_file_ocr = os.path.join(dir_out, file_name+'.xml')
|
||||
|
||||
if os.path.exists(out_file_ocr):
|
||||
if overwrite:
|
||||
self.logger.warning("will overwrite existing output file '%s'", out_file_ocr)
|
||||
else:
|
||||
self.logger.warning("will skip input for existing output file '%s'", out_file_ocr)
|
||||
continue
|
||||
|
||||
img = cv2.imread(dir_img)
|
||||
if dir_in_bin is not None:
|
||||
cropped_lines_bin = []
|
||||
img_bin = cv2.imread(os.path.join(dir_in_bin, file_name+'.png'))
|
||||
|
||||
if dir_out_image_text:
|
||||
out_image_with_text = os.path.join(dir_out_image_text, file_name+'.png')
|
||||
image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
|
||||
draw = ImageDraw.Draw(image_text)
|
||||
total_bb_coordinates = []
|
||||
|
||||
tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8"))
|
||||
root1=tree1.getroot()
|
||||
alltags=[elem.tag for elem in root1.iter()]
|
||||
link=alltags[0].split('}')[0]+'}'
|
||||
|
||||
name_space = alltags[0].split('}')[0]
|
||||
name_space = name_space.split('{')[1]
|
||||
|
||||
region_tags=np.unique([x for x in alltags if x.endswith('TextRegion')])
|
||||
|
||||
cropped_lines = []
|
||||
img_crop_bin = None
|
||||
imgs_bin = None
|
||||
imgs_bin_ver_flipped = None
|
||||
cropped_lines_bin = []
|
||||
cropped_lines_ver_index = []
|
||||
cropped_lines_region_indexer = []
|
||||
cropped_lines_meging_indexing = []
|
||||
|
||||
tinl = time.time()
|
||||
indexer_text_region = 0
|
||||
indexer_textlines = 0
|
||||
for nn in root1.iter(region_tags):
|
||||
for nn in page_tree.getroot().iter(f'{{{page_ns}}}TextRegion'):
|
||||
try:
|
||||
type_textregion = nn.attrib['type']
|
||||
except:
|
||||
|
|
@ -502,13 +312,12 @@ class Eynollah_ocr:
|
|||
if type_textregion=='drop-capital':
|
||||
angle_degrees = 0
|
||||
|
||||
if dir_out_image_text:
|
||||
total_bb_coordinates.append([x,y,w,h])
|
||||
|
||||
w_scaled = w * image_height/float(h)
|
||||
|
||||
img_poly_on_img = np.copy(img)
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
img_poly_on_img_bin = np.copy(img_bin)
|
||||
img_crop_bin = img_poly_on_img_bin[y:y+h, x:x+w, :]
|
||||
|
||||
|
|
@ -528,7 +337,7 @@ class Eynollah_ocr:
|
|||
better_des_slope = get_orientation_moments(textline_coords)
|
||||
|
||||
img_crop = rotate_image_with_padding(img_crop, better_des_slope)
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope)
|
||||
|
||||
mask_poly = rotate_image_with_padding(mask_poly, better_des_slope)
|
||||
|
|
@ -542,13 +351,13 @@ class Eynollah_ocr:
|
|||
|
||||
if not self.do_not_mask_with_textline_contour:
|
||||
img_crop[mask_poly==0] = 255
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
|
||||
if not self.do_not_mask_with_textline_contour:
|
||||
img_crop_bin[mask_poly==0] = 255
|
||||
|
||||
if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90:
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
img_crop, img_crop_bin = \
|
||||
break_curved_line_into_small_pieces_and_then_merge(
|
||||
img_crop, mask_poly, img_crop_bin)
|
||||
|
|
@ -561,14 +370,14 @@ class Eynollah_ocr:
|
|||
better_des_slope = 0
|
||||
if not self.do_not_mask_with_textline_contour:
|
||||
img_crop[mask_poly==0] = 255
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
if not self.do_not_mask_with_textline_contour:
|
||||
img_crop_bin[mask_poly==0] = 255
|
||||
if type_textregion=='drop-capital':
|
||||
pass
|
||||
else:
|
||||
if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90:
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
img_crop, img_crop_bin = \
|
||||
break_curved_line_into_small_pieces_and_then_merge(
|
||||
img_crop, mask_poly, img_crop_bin)
|
||||
|
|
@ -587,13 +396,13 @@ class Eynollah_ocr:
|
|||
cropped_lines_ver_index.append(0)
|
||||
|
||||
cropped_lines_meging_indexing.append(0)
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
|
||||
img_crop_bin, image_height, image_width)
|
||||
cropped_lines_bin.append(img_fin)
|
||||
else:
|
||||
splited_images, splited_images_bin = return_textlines_split_if_needed(
|
||||
img_crop, img_crop_bin if dir_in_bin is not None else None)
|
||||
img_crop, img_crop_bin if img_bin else None)
|
||||
if splited_images:
|
||||
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
|
||||
splited_images[0], image_height, image_width)
|
||||
|
|
@ -616,7 +425,7 @@ class Eynollah_ocr:
|
|||
else:
|
||||
cropped_lines_ver_index.append(0)
|
||||
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
|
||||
splited_images_bin[0], image_height, image_width)
|
||||
cropped_lines_bin.append(img_fin)
|
||||
|
|
@ -635,7 +444,7 @@ class Eynollah_ocr:
|
|||
else:
|
||||
cropped_lines_ver_index.append(0)
|
||||
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
|
||||
img_crop_bin, image_height, image_width)
|
||||
cropped_lines_bin.append(img_fin)
|
||||
|
|
@ -648,6 +457,7 @@ class Eynollah_ocr:
|
|||
|
||||
n_iterations = math.ceil(len(cropped_lines) / self.b_s)
|
||||
|
||||
# FIXME: copy pasta
|
||||
for i in range(n_iterations):
|
||||
if i==(n_iterations-1):
|
||||
n_start = i*self.b_s
|
||||
|
|
@ -667,7 +477,7 @@ class Eynollah_ocr:
|
|||
else:
|
||||
imgs_ver_flipped = None
|
||||
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
imgs_bin = cropped_lines_bin[n_start:]
|
||||
imgs_bin = np.array(imgs_bin)
|
||||
imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
|
||||
|
|
@ -697,7 +507,7 @@ class Eynollah_ocr:
|
|||
imgs_ver_flipped = None
|
||||
|
||||
|
||||
if dir_in_bin is not None:
|
||||
if img_bin:
|
||||
imgs_bin = cropped_lines_bin[n_start:n_end]
|
||||
imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
|
||||
|
||||
|
|
@ -743,7 +553,8 @@ class Eynollah_ocr:
|
|||
indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
|
||||
preds[indices_to_be_replaced,:,:] = \
|
||||
preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
|
||||
if dir_in_bin is not None:
|
||||
|
||||
if img_bin:
|
||||
preds_bin = self.model_zoo.get('ocr').predict(imgs_bin, verbose=0)
|
||||
|
||||
if len(indices_ver)>0:
|
||||
|
|
@ -797,7 +608,6 @@ class Eynollah_ocr:
|
|||
extracted_texts.append("")
|
||||
extracted_conf_value.append(0)
|
||||
del cropped_lines
|
||||
if dir_in_bin is not None:
|
||||
del cropped_lines_bin
|
||||
gc.collect()
|
||||
|
||||
|
|
@ -808,24 +618,46 @@ class Eynollah_ocr:
|
|||
else None
|
||||
for ind in range(len(cropped_lines_meging_indexing))]
|
||||
|
||||
extracted_conf_value_merged = [extracted_conf_value[ind]
|
||||
extracted_conf_value_merged = [extracted_conf_value[ind] # type: ignore
|
||||
if cropped_lines_meging_indexing[ind]==0
|
||||
else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2.
|
||||
if cropped_lines_meging_indexing[ind]==1
|
||||
else None
|
||||
for ind in range(len(cropped_lines_meging_indexing))]
|
||||
|
||||
extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm]
|
||||
extracted_conf_value_merged: List[float] = [extracted_conf_value_merged[ind_cfm]
|
||||
for ind_cfm in range(len(extracted_texts_merged))
|
||||
if extracted_texts_merged[ind_cfm] is not None]
|
||||
extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
|
||||
unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
|
||||
|
||||
if dir_out_image_text:
|
||||
#font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists!
|
||||
font = importlib_resources.files(__package__) / "Charis-Regular.ttf"
|
||||
with importlib_resources.as_file(font) as font:
|
||||
font = ImageFont.truetype(font=font, size=40)
|
||||
extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
|
||||
|
||||
return EynollahOcrResult(
|
||||
extracted_texts_merged=extracted_texts_merged,
|
||||
extracted_conf_value_merged=extracted_conf_value_merged,
|
||||
cropped_lines_region_indexer=cropped_lines_region_indexer,
|
||||
total_bb_coordinates=total_bb_coordinates,
|
||||
)
|
||||
|
||||
def write_ocr(
|
||||
self,
|
||||
*,
|
||||
result: EynollahOcrResult,
|
||||
page_tree: ET.ElementTree,
|
||||
out_file_ocr,
|
||||
page_ns,
|
||||
img,
|
||||
out_image_with_text,
|
||||
):
|
||||
cropped_lines_region_indexer = result.cropped_lines_region_indexer
|
||||
total_bb_coordinates = result.total_bb_coordinates
|
||||
extracted_texts_merged = result.extracted_texts_merged
|
||||
extracted_conf_value_merged = result.extracted_conf_value_merged
|
||||
|
||||
unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
|
||||
if out_image_with_text:
|
||||
image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
|
||||
draw = ImageDraw.Draw(image_text)
|
||||
font = get_font()
|
||||
|
||||
for indexer_text, bb_ind in enumerate(total_bb_coordinates):
|
||||
x_bb = bb_ind[0]
|
||||
|
|
@ -868,25 +700,10 @@ class Eynollah_ocr:
|
|||
text_by_textregion.append(text_by_textregion_ind)
|
||||
else:
|
||||
text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
||||
#print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
|
||||
|
||||
###index_tot_regions = []
|
||||
###tot_region_ref = []
|
||||
|
||||
###for jj in root1.iter(link+'RegionRefIndexed'):
|
||||
###index_tot_regions.append(jj.attrib['index'])
|
||||
###tot_region_ref.append(jj.attrib['regionRef'])
|
||||
|
||||
###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)}
|
||||
|
||||
#id_textregions = []
|
||||
#textregions_by_existing_ids = []
|
||||
indexer = 0
|
||||
indexer_textregion = 0
|
||||
for nn in root1.iter(region_tags):
|
||||
#id_textregion = nn.attrib['id']
|
||||
#id_textregions.append(id_textregion)
|
||||
#textregions_by_existing_ids.append(text_by_textregion[indexer_textregion])
|
||||
for nn in page_tree.getroot().iter(f'{{{page_ns}}}TextRegion'):
|
||||
|
||||
is_textregion_text = False
|
||||
for childtest in nn:
|
||||
|
|
@ -910,6 +727,7 @@ class Eynollah_ocr:
|
|||
|
||||
if not is_textline_text:
|
||||
text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
|
||||
if extracted_conf_value_merged:
|
||||
text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
|
||||
unicode_textline = ET.SubElement(text_subelement, 'Unicode')
|
||||
unicode_textline.text = extracted_texts_merged[indexer]
|
||||
|
|
@ -918,8 +736,8 @@ class Eynollah_ocr:
|
|||
if childtest3.tag.endswith("TextEquiv"):
|
||||
for child_uc in childtest3:
|
||||
if child_uc.tag.endswith("Unicode"):
|
||||
childtest3.set('conf',
|
||||
f"{extracted_conf_value_merged[indexer]:.2f}")
|
||||
if extracted_conf_value_merged:
|
||||
childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
|
||||
child_uc.text = extracted_texts_merged[indexer]
|
||||
|
||||
indexer = indexer + 1
|
||||
|
|
@ -935,18 +753,85 @@ class Eynollah_ocr:
|
|||
unicode_textregion.text = text_by_textregion[indexer_textregion]
|
||||
indexer_textregion = indexer_textregion + 1
|
||||
|
||||
###sample_order = [(id_to_order[tid], text)
|
||||
### for tid, text in zip(id_textregions, textregions_by_existing_ids)
|
||||
### if tid in id_to_order]
|
||||
ET.register_namespace("",page_ns)
|
||||
page_tree.write(out_file_ocr, xml_declaration=True, method='xml', encoding="utf-8", default_namespace=None)
|
||||
|
||||
##ordered_texts_sample = [text for _, text in sorted(sample_order)]
|
||||
##tot_page_text = ' '.join(ordered_texts_sample)
|
||||
def run(
    self,
    *,
    overwrite: bool = False,
    dir_in: Optional[str] = None,
    dir_in_bin: Optional[str] = None,
    image_filename: Optional[str] = None,
    dir_xmls: str,
    dir_out_image_text: Optional[str] = None,
    dir_out: str,
):
    """
    Run OCR on one image or on every image of a directory.

    For each input image the PAGE-XML file with the same stem is read from
    ``dir_xmls``, the text lines are recognised (TrOCR if ``self.tr_ocr``
    is set, the CNN model otherwise) and the result is written to
    ``dir_out`` under the same stem.

    Args:
        overwrite (bool): Re-create output files that already exist.
            If false, inputs whose output file exists are skipped.
        dir_in (str): Directory of input images. Takes precedence over
            ``image_filename``.
        dir_in_bin (str): Prediction with RGB and binarized images for
            selected pages, should not be the default. Binarized images
            are looked up as ``<stem>.png`` in this directory.
        image_filename (str): Single input image; required when
            ``dir_in`` is not given.
        dir_xmls (str): Directory holding the input PAGE-XML files.
        dir_out_image_text (str): If given, a rendering of the recognised
            text is saved there as ``<stem>.png``.
        dir_out (str): Directory for the resulting PAGE-XML files.
    """
    if dir_in:
        ls_imgs = [os.path.join(dir_in, image_filename)
                   for image_filename in filter(is_image_filename,
                                                os.listdir(dir_in))]
    else:
        assert image_filename
        ls_imgs = [image_filename]

    for img_filename in ls_imgs:
        file_stem = Path(img_filename).stem
        page_file_in = os.path.join(dir_xmls, file_stem+'.xml')
        out_file_ocr = os.path.join(dir_out, file_stem+'.xml')

        if os.path.exists(out_file_ocr):
            if overwrite:
                self.logger.warning("will overwrite existing output file '%s'", out_file_ocr)
            else:
                self.logger.warning("will skip input for existing output file '%s'", out_file_ocr)
                # Skip only this image; returning here would silently drop
                # every remaining image of a directory run.
                continue

        img = cv2.imread(img_filename)

        page_tree = ET.parse(page_file_in, parser = ET.XMLParser(encoding="utf-8"))
        page_ns = etree_namespace_for_element_tag(page_tree.getroot().tag)

        out_image_with_text = None
        if dir_out_image_text:
            out_image_with_text = os.path.join(dir_out_image_text, file_stem + '.png')

        img_bin = None
        if dir_in_bin:
            img_bin = cv2.imread(os.path.join(dir_in_bin, file_stem+'.png'))

        if self.tr_ocr:
            result = self.run_trocr(
                img=img,
                page_tree=page_tree,
                page_ns=page_ns,

                tr_ocr_input_height_and_width = 384
            )
        else:
            result = self.run_cnn(
                img=img,
                page_tree=page_tree,
                page_ns=page_ns,

                img_bin=img_bin,
                image_width=512,
                image_height=32,
            )

        self.write_ocr(
            result=result,
            img=img,
            page_tree=page_tree,
            page_ns=page_ns,
            out_file_ocr=out_file_ocr,
            out_image_with_text=out_image_with_text,
        )
|
||||
|
|
|
|||
16
src/eynollah/utils/font.py
Normal file
16
src/eynollah/utils/font.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
|
||||
# importlib.resources.files is needed below (added to the stdlib in Python 3.9); the guard switches at 3.10, older interpreters use the importlib_resources backport
|
||||
import sys
|
||||
from PIL import ImageFont
|
||||
|
||||
if sys.version_info < (3, 10):
|
||||
import importlib_resources
|
||||
else:
|
||||
import importlib.resources as importlib_resources
|
||||
|
||||
|
||||
def get_font():
    """
    Load the bundled Charis-Regular TrueType font at size 40.

    The font file is addressed relative to this package
    (``../Charis-Regular.ttf``) through ``importlib_resources``.
    """
    # font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists!
    package_root = importlib_resources.files(__package__)
    font_resource = package_root / "../Charis-Regular.ttf"
    with importlib_resources.as_file(font_resource) as font_path:
        loaded_font = ImageFont.truetype(font=font_path, size=40)
        return loaded_font
|
||||
|
|
@ -128,6 +128,7 @@ def return_textlines_split_if_needed(textline_image, textline_image_bin=None):
|
|||
return [image1, image2], None
|
||||
else:
|
||||
return None, None
|
||||
|
||||
def preprocess_and_resize_image_for_ocrcnn_model(img, image_height, image_width):
|
||||
if img.shape[0]==0 or img.shape[1]==0:
|
||||
img_fin = np.ones((image_height, image_width, 3))
|
||||
|
|
|
|||
|
|
@ -88,3 +88,7 @@ def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region
|
|||
order_of_texts.append(interest)
|
||||
|
||||
return order_of_texts, id_of_texts
|
||||
|
||||
def etree_namespace_for_element_tag(tag: str) -> str:
    """
    Return the namespace URI of an ElementTree tag.

    ElementTree spells qualified names as ``{uri}local``; this returns the
    ``uri`` part.

    Args:
        tag: Element tag as found in ``Element.tag``.

    Returns:
        The namespace URI, or an empty string when ``tag`` carries no
        ``{uri}`` prefix (previously a slice of the local name was
        returned in that case).
    """
    if not tag.startswith('{'):
        return ''
    uri, closing, _ = tag[1:].partition('}')
    # No closing brace: not a qualified name, so there is no namespace.
    return uri if closing else ''
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue