Eynollah_ocr: adapt to inference model, improve and simplify…

- drop `end_character` mechanics and `characters` model type
  for decoding output probability (not needed)
- drop `decode_batch_predictions()` and `num_to_char` model type
  (part of inference model)
- drop rough confidence estimation calculation
  (returned precisely by inference model)
- adapt model prediction to inference model: just omit zeros,
  map to bytes, filter OOV tokens and decode UTF-8 to str
- if no binarization input was provided, then compute it on the fly
  using `binarization` model
- also apply `min_conf_value_of_textline_text` (as for TrOCR)
- batching over entire page instead of region-wise
  (which underfilled batches)
- simplify and remove redundant copy-pasted code
- rename `extracted_conf_value_merged` → `extracted_confs_merged`
- move `batched()` from `utils.utils_ocr` to `utils`
- drop `utils_ocr.distortion_free_resize()` (not needed)
- simplify `utils_ocr.break_curved_line_into_small_pieces_and_then_merge()`
- drop `utils_ocr.return_textline_contour_with_added_box_coordinate()`
  and `utils_ocr.return_rnn_cnn_ocr_of_given_textlines()` (not needed)
This commit is contained in:
Robert Sachunsky 2026-06-02 21:20:06 +02:00
parent a391ee24e6
commit 8ffc4ed8d3
3 changed files with 206 additions and 631 deletions

View file

@ -19,27 +19,29 @@ from ocrd_utils import polygon_from_points, xywh_from_polygon
from .eynollah import Eynollah from .eynollah import Eynollah
from .model_zoo import EynollahModelZoo from .model_zoo import EynollahModelZoo
from .utils import is_image_filename from .utils import (
is_image_filename,
batched,
pairwise,
)
from .utils.font import get_font from .utils.font import get_font
from .utils.xml import etree_namespace_for_element_tag from .utils.xml import etree_namespace_for_element_tag
from .utils.resize import resize_image from .utils.resize import resize_image
from .utils.utils_ocr import ( from .utils.utils_ocr import (
break_curved_line_into_small_pieces_and_then_merge, break_curved_line_into_small_pieces_and_then_merge,
decode_batch_predictions,
fit_text_single_line, fit_text_single_line,
get_contours_and_bounding_boxes, get_contours_and_bounding_boxes,
get_orientation_moments, get_orientation_moments,
preprocess_and_resize_image_for_ocrcnn_model, preprocess_and_resize_image_for_ocrcnn_model,
return_textlines_split_if_needed, return_textlines_split_if_needed,
rotate_image_with_padding, rotate_image_with_padding,
batched,
) )
# TODO: refine typing # TODO: refine typing
@dataclass @dataclass
class EynollahOcrResult: class EynollahOcrResult:
extracted_texts_merged: List extracted_texts_merged: List
extracted_conf_value_merged: Optional[List] extracted_confs_merged: Optional[List]
cropped_lines_region_indexer: List cropped_lines_region_indexer: List
total_bb_coordinates:List total_bb_coordinates:List
@ -73,10 +75,8 @@ class Eynollah_ocr(Eynollah):
device=device) device=device)
else: else:
self.model_zoo.load_models('ocr', self.model_zoo.load_models('ocr',
'num_to_char', 'binarization',
'characters',
device=device) device=device)
self.end_character = len(self.model_zoo.get('characters')) + 2
@property @property
def device(self): def device(self):
@ -95,8 +95,6 @@ class Eynollah_ocr(Eynollah):
cropped_lines = [] cropped_lines = []
cropped_lines_region_indexer = [] cropped_lines_region_indexer = []
cropped_lines_meging_indexing = [] cropped_lines_meging_indexing = []
extracted_texts = []
extracted_confs = []
for n_region, region in enumerate(page_tree.getroot().iter('{%s}TextRegion' % page_ns)): for n_region, region in enumerate(page_tree.getroot().iter('{%s}TextRegion' % page_ns)):
for n_line, line in enumerate(region.iter('{%s}TextLine' % page_ns)): for n_line, line in enumerate(region.iter('{%s}TextLine' % page_ns)):
@ -139,7 +137,8 @@ class Eynollah_ocr(Eynollah):
cropped_lines.append(img_crop) cropped_lines.append(img_crop)
cropped_lines_meging_indexing.append(0) cropped_lines_meging_indexing.append(0)
extracted_texts = []
extracted_confs = []
self.logger.debug("processing %d lines for %d regions", self.logger.debug("processing %d lines for %d regions",
len(cropped_lines), len(set(cropped_lines_region_indexer))) len(cropped_lines), len(set(cropped_lines_region_indexer)))
for imgs in batched(cropped_lines, self.b_s): for imgs in batched(cropped_lines, self.b_s):
@ -157,6 +156,10 @@ class Eynollah_ocr(Eynollah):
conf = output.sequences_scores.exp().clamp(0.0, 1.0).tolist() conf = output.sequences_scores.exp().clamp(0.0, 1.0).tolist()
else: else:
conf = [1.0] * len(output.sequences) conf = [1.0] * len(output.sequences)
if conf < self.min_conf_value_of_textline_text:
extracted_confs.extend(0)
extracted_texts.extend("")
continue
text = self.model_zoo.get('trocr_processor').batch_decode( text = self.model_zoo.get('trocr_processor').batch_decode(
output.sequences, output.sequences,
skip_special_tokens=True, skip_special_tokens=True,
@ -179,7 +182,7 @@ class Eynollah_ocr(Eynollah):
return EynollahOcrResult( return EynollahOcrResult(
extracted_texts_merged=extracted_texts_merged, extracted_texts_merged=extracted_texts_merged,
extracted_conf_value_merged=extracted_confs_merged, extracted_confs_merged=extracted_confs_merged,
cropped_lines_region_indexer=cropped_lines_region_indexer, cropped_lines_region_indexer=cropped_lines_region_indexer,
total_bb_coordinates=total_bb_coordinates, total_bb_coordinates=total_bb_coordinates,
) )
@ -196,362 +199,163 @@ class Eynollah_ocr(Eynollah):
) -> EynollahOcrResult: ) -> EynollahOcrResult:
total_bb_coordinates = [] total_bb_coordinates = []
cropped_lines_rgb = []
cropped_lines = []
img_crop_bin = None
imgs_bin = None
imgs_bin_ver_flipped = None
cropped_lines_bin = [] cropped_lines_bin = []
cropped_lines_ver_index = [] cropped_lines_ver_index = []
cropped_lines_region_indexer = [] cropped_lines_region_indexer = []
cropped_lines_meging_indexing = [] cropped_lines_meging_indexing = []
indexer_text_region = 0 img_rgb = img # cosmetic
for nn in page_tree.getroot().iter(f'{{{page_ns}}}TextRegion'): if img_bin is None:
try: # run ad-hoc binarization
type_textregion = nn.attrib['type'] self.logger.info("running binarization for ensemble input")
except: img_bin = self.do_prediction(True, img, self.model_zoo.get("binarization"),
type_textregion = 'paragraph' n_batch_inference=5)
for child_textregion in nn: img_bin = np.repeat(img_bin[:, :, np.newaxis], 3, axis=2)
if child_textregion.tag.endswith("TextLine"): img_bin = 255 * (img_bin == 0).astype(np.uint8)
for child_textlines in child_textregion:
if child_textlines.tag.endswith("Coords"):
cropped_lines_region_indexer.append(indexer_text_region)
p_h=child_textlines.attrib['points'].split(' ')
textline_coords = np.array( [ [int(x.split(',')[0]),
int(x.split(',')[1]) ]
for x in p_h] )
x,y,w,h = cv2.boundingRect(textline_coords) for n_region, region in enumerate(page_tree.getroot().iter('{%s}TextRegion' % page_ns)):
type_textregion = region.attrib.get('type', 'paragraph')
for n_line, line in enumerate(region.iter('{%s}TextLine' % page_ns)):
cropped_lines_region_indexer.append(n_region)
angle_radians = math.atan2(h, w) coords = line.find('{%s}Coords' % page_ns)
# Convert to degrees if coords is None:
angle_degrees = math.degrees(angle_radians) self.logger.warning("region '%s' line '%s' has no Coords", region.attrib['id'], line.attrib['id'])
if type_textregion=='drop-capital': continue
angle_degrees = 0 poly = np.array(polygon_from_points(coords.attrib['points'])).astype(int)
cont = poly[:, np.newaxis]
xywh = xywh_from_polygon(poly)
x, y, w, h = xywh['x'], xywh['y'], xywh['w'], xywh['h']
total_bb_coordinates.append([x,y,w,h]) angle_radians = math.atan2(h, w)
angle_degrees = math.degrees(angle_radians)
if type_textregion=='drop-capital':
angle_degrees = 0
w_scaled = w * image_height/float(h) total_bb_coordinates.append([x, y, w, h])
img_poly_on_img = np.copy(img) w_scaled = w * image_height / float(h)
if img_bin:
img_poly_on_img_bin = np.copy(img_bin)
img_crop_bin = img_poly_on_img_bin[y:y+h, x:x+w, :]
mask_poly = np.zeros(img.shape) img_crop_rgb = img_rgb[y: y + h, x: x + w]
mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1)) img_crop_bin = img_bin[y: y + h, x: x + w]
mask_poly = np.zeros(img_crop_rgb.shape[:2], dtype=np.uint8)
mask_poly = cv2.fillPoly(mask_poly, pts=[cont - [x, y]], color=1)
mask_poly = mask_poly[y:y+h, x:x+w, :] if angle_degrees > 3:
img_crop = img_poly_on_img[y:y+h, x:x+w, :] better_des_slope = get_orientation_moments(cont)
img_crop_rgb = rotate_image_with_padding(img_crop_rgb, better_des_slope)
img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope)
mask_poly = rotate_image_with_padding(mask_poly, better_des_slope)
# get new bounding box
x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly)
img_crop_rgb = img_crop_rgb[y_n: y_n + h_n, x_n: x_n + w_n]
img_crop_bin = img_crop_bin[y_n: y_n + h_n, x_n: x_n + w_n]
mask_poly = mask_poly[y_n: y_n + h_n, x_n: x_n + w_n]
else:
better_des_slope = 0
# print(file_name, angle_degrees, w*h, if not self.do_not_mask_with_textline_contour:
# mask_poly[:,:,0].sum(), img_crop_rgb[mask_poly == 0] = 255 # FIXME: or median color?
# mask_poly[:,:,0].sum() /float(w*h) , img_crop_bin[mask_poly == 0] = 255
# 'didi')
if angle_degrees > 3: if (type_textregion !='drop-capital' and
better_des_slope = get_orientation_moments(textline_coords) mask_poly.sum() < 0.50 * mask_poly.size and
w_scaled > 90):
img_crop = rotate_image_with_padding(img_crop, better_des_slope) img_crop_rgb, img_crop_bin = \
if img_bin: break_curved_line_into_small_pieces_and_then_merge(
img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope) img_crop_rgb, img_crop_bin, mask_poly)
mask_poly = rotate_image_with_padding(mask_poly, better_des_slope) if w_scaled < 750:#1.5*image_width:
mask_poly = mask_poly.astype('uint8') img_crop_split_rgb = img_crop_split_bin = None
else:
img_crop_split_rgb, img_crop_split_bin = return_textlines_split_if_needed(
img_crop_rgb, img_crop_bin)
if img_crop_split_rgb:
cropped_lines_rgb.extend(img_crop_split_rgb)
cropped_lines_bin.extend(img_crop_split_bin)
if abs(better_des_slope) > 45:
cropped_lines_ver_index.append(1)
cropped_lines_ver_index.append(1)
else:
cropped_lines_ver_index.append(0)
cropped_lines_ver_index.append(0)
cropped_lines_meging_indexing.append(1)
cropped_lines_meging_indexing.append(-1)
else:
cropped_lines_rgb.append(img_crop_rgb)
cropped_lines_bin.append(img_crop_bin)
if abs(better_des_slope) > 45:
cropped_lines_ver_index.append(1)
else:
cropped_lines_ver_index.append(0)
cropped_lines_meging_indexing.append(0)
#new bounding box cropped_lines_rgb = [preprocess_and_resize_image_for_ocrcnn_model(img, image_height, image_width)
x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0]) for img in cropped_lines_rgb]
cropped_lines_bin = [preprocess_and_resize_image_for_ocrcnn_model(img, image_height, image_width)
mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] for img in cropped_lines_bin]
img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :]
if not self.do_not_mask_with_textline_contour:
img_crop[mask_poly==0] = 255
if img_bin:
img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
if not self.do_not_mask_with_textline_contour:
img_crop_bin[mask_poly==0] = 255
if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90:
if img_bin:
img_crop, img_crop_bin = \
break_curved_line_into_small_pieces_and_then_merge(
img_crop, mask_poly, img_crop_bin)
else:
img_crop, _ = \
break_curved_line_into_small_pieces_and_then_merge(
img_crop, mask_poly)
else:
better_des_slope = 0
if not self.do_not_mask_with_textline_contour:
img_crop[mask_poly==0] = 255
if img_bin:
if not self.do_not_mask_with_textline_contour:
img_crop_bin[mask_poly==0] = 255
if type_textregion=='drop-capital':
pass
else:
if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90:
if img_bin:
img_crop, img_crop_bin = \
break_curved_line_into_small_pieces_and_then_merge(
img_crop, mask_poly, img_crop_bin)
else:
img_crop, _ = \
break_curved_line_into_small_pieces_and_then_merge(
img_crop, mask_poly)
if w_scaled < 750:#1.5*image_width:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
img_crop, image_height, image_width)
cropped_lines.append(img_fin)
if abs(better_des_slope) > 45:
cropped_lines_ver_index.append(1)
else:
cropped_lines_ver_index.append(0)
cropped_lines_meging_indexing.append(0)
if img_bin:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
img_crop_bin, image_height, image_width)
cropped_lines_bin.append(img_fin)
else:
splited_images, splited_images_bin = return_textlines_split_if_needed(
img_crop, img_crop_bin if img_bin else None)
if splited_images:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
splited_images[0], image_height, image_width)
cropped_lines.append(img_fin)
cropped_lines_meging_indexing.append(1)
if abs(better_des_slope) > 45:
cropped_lines_ver_index.append(1)
else:
cropped_lines_ver_index.append(0)
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
splited_images[1], image_height, image_width)
cropped_lines.append(img_fin)
cropped_lines_meging_indexing.append(-1)
if abs(better_des_slope) > 45:
cropped_lines_ver_index.append(1)
else:
cropped_lines_ver_index.append(0)
if img_bin:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
splited_images_bin[0], image_height, image_width)
cropped_lines_bin.append(img_fin)
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
splited_images_bin[1], image_height, image_width)
cropped_lines_bin.append(img_fin)
else:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
img_crop, image_height, image_width)
cropped_lines.append(img_fin)
cropped_lines_meging_indexing.append(0)
if abs(better_des_slope) > 45:
cropped_lines_ver_index.append(1)
else:
cropped_lines_ver_index.append(0)
if img_bin:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
img_crop_bin, image_height, image_width)
cropped_lines_bin.append(img_fin)
indexer_text_region = indexer_text_region +1
extracted_texts = [] extracted_texts = []
extracted_conf_value = [] extracted_confs = []
self.logger.debug("processing %d lines for %d regions",
len(cropped_lines_rgb), len(set(cropped_lines_region_indexer)))
cropped_lines = zip(cropped_lines_rgb, cropped_lines_bin, cropped_lines_ver_index)
for batch in batched(cropped_lines, self.b_s):
imgs_rgb, imgs_bin, ver_index = zip(*batch)
ver_index = np.array(ver_index)
imgs_rgb = np.stack(imgs_rgb)
imgs_bin = np.stack(imgs_bin)
imgs_rgb_ver = imgs_rgb[ver_index > 0, ::-1, ::-1]
imgs_bin_ver = imgs_bin[ver_index > 0, ::-1, ::-1]
n_iterations = math.ceil(len(cropped_lines) / self.b_s) # inference model now yields (char-bytes, line-prob) instead of vocidx-softmax
# (so ctc_decode and inverse StringLookup are included)
# also, the model now expects a secondary binary input image
preds, probs = self.model_zoo.get('ocr').predict((imgs_rgb, imgs_bin), verbose=0)
# FIXME: copy pasta if ver_index.any():
for i in range(n_iterations): preds_ver, probs_ver = self.model_zoo.get('ocr').predict((imgs_rgb_ver, imgs_bin_ver), verbose=0)
if i==(n_iterations-1): flipped_ver_is_better = np.flatnonzero(probs_ver > probs[ver_index > 0])
n_start = i*self.b_s if len(flipped_ver_is_better):
imgs = cropped_lines[n_start:] self.logger.info("%d skewed lines perform better when flipped", len(flipped_ver_is_better))
imgs = np.array(imgs) preds[ver_index > 0][flipped_ver_is_better] = preds_ver[flipped_ver_is_better]
imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3) probs[ver_index > 0][flipped_ver_is_better] = probs_ver[flipped_ver_is_better]
ver_imgs = np.array( cropped_lines_ver_index[n_start:] ) def nooov(x):
indices_ver = np.where(ver_imgs == 1)[0] return x != b'[UNK]'
for pred, prob in zip(preds, probs):
#print(indices_ver, 'indices_ver') if prob < self.min_conf_value_of_textline_text:
if len(indices_ver)>0:
imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
#print(imgs_ver_flipped, 'imgs_ver_flipped')
else:
imgs_ver_flipped = None
if img_bin:
imgs_bin = cropped_lines_bin[n_start:]
imgs_bin = np.array(imgs_bin)
imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
if len(indices_ver)>0:
imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
#print(imgs_ver_flipped, 'imgs_ver_flipped')
else:
imgs_bin_ver_flipped = None
else:
n_start = i*self.b_s
n_end = (i+1)*self.b_s
imgs = cropped_lines[n_start:n_end]
imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3)
ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] )
indices_ver = np.where(ver_imgs == 1)[0]
#print(indices_ver, 'indices_ver')
if len(indices_ver)>0:
imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
#print(imgs_ver_flipped, 'imgs_ver_flipped')
else:
imgs_ver_flipped = None
if img_bin:
imgs_bin = cropped_lines_bin[n_start:n_end]
imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
if len(indices_ver)>0:
imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
#print(imgs_ver_flipped, 'imgs_ver_flipped')
else:
imgs_bin_ver_flipped = None
self.logger.debug("processing next %d lines", len(imgs))
preds = self.model_zoo.get('ocr').predict(imgs, verbose=0)
if len(indices_ver)>0:
preds_flipped = self.model_zoo.get('ocr').predict(imgs_ver_flipped, verbose=0)
preds_max_fliped = np.max(preds_flipped, axis=2 )
preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
masked_means_flipped = \
np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \
np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
masked_means_flipped[np.isnan(masked_means_flipped)] = 0
preds_max = np.max(preds, axis=2 )
preds_max_args = np.argmax(preds, axis=2 )
pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
masked_means = \
np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
np.sum(pred_max_not_unk_mask_bool, axis=1)
masked_means[np.isnan(masked_means)] = 0
masked_means_ver = masked_means[indices_ver]
#print(masked_means_ver, 'pred_max_not_unk')
indices_where_flipped_conf_value_is_higher = \
np.where(masked_means_flipped > masked_means_ver)[0]
#print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
if len(indices_where_flipped_conf_value_is_higher)>0:
indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
preds[indices_to_be_replaced,:,:] = \
preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
if img_bin:
preds_bin = self.model_zoo.get('ocr').predict(imgs_bin, verbose=0)
if len(indices_ver)>0:
preds_flipped = self.model_zoo.get('ocr').predict(imgs_bin_ver_flipped, verbose=0)
preds_max_fliped = np.max(preds_flipped, axis=2 )
preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
masked_means_flipped = \
np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \
np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
masked_means_flipped[np.isnan(masked_means_flipped)] = 0
preds_max = np.max(preds, axis=2 )
preds_max_args = np.argmax(preds, axis=2 )
pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
masked_means = \
np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
np.sum(pred_max_not_unk_mask_bool, axis=1)
masked_means[np.isnan(masked_means)] = 0
masked_means_ver = masked_means[indices_ver]
#print(masked_means_ver, 'pred_max_not_unk')
indices_where_flipped_conf_value_is_higher = \
np.where(masked_means_flipped > masked_means_ver)[0]
#print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
if len(indices_where_flipped_conf_value_is_higher)>0:
indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
preds_bin[indices_to_be_replaced,:,:] = \
preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
preds = (preds + preds_bin) / 2.
pred_texts = decode_batch_predictions(preds, self.model_zoo.get('num_to_char'))
preds_max = np.max(preds, axis=2 )
preds_max_args = np.argmax(preds, axis=2 )
pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
masked_means = \
np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
np.sum(pred_max_not_unk_mask_bool, axis=1)
for ib in range(imgs.shape[0]):
pred_texts_ib = pred_texts[ib].replace("[UNK]", "")
if masked_means[ib] >= self.min_conf_value_of_textline_text:
extracted_texts.append(pred_texts_ib)
extracted_conf_value.append(masked_means[ib])
else:
extracted_texts.append("") extracted_texts.append("")
extracted_conf_value.append(0) extracted_confs.append(0)
del cropped_lines else:
text = b''.join(
filter(nooov,
map(bytes,
(filter(None, char)
for char in pred.tolist())))).decode('utf-8')
extracted_texts.append(text)
extracted_confs.append(prob)
del cropped_lines_rgb
del cropped_lines_bin del cropped_lines_bin
gc.collect() gc.collect()
extracted_texts_merged = [extracted_texts[ind] extracted_texts_merged = [extracted_texts[ind]
if cropped_lines_meging_indexing[ind]==0 if cropped_lines_meging_indexing[ind] == 0
else extracted_texts[ind]+" "+extracted_texts[ind+1] else extracted_texts[ind] + " " + extracted_texts[ind + 1]
if cropped_lines_meging_indexing[ind]==1 for ind in range(len(cropped_lines_meging_indexing))
else None if cropped_lines_meging_indexing[ind] >= 0]
for ind in range(len(cropped_lines_meging_indexing))] extracted_confs_merged = [extracted_confs[ind]
if cropped_lines_meging_indexing[ind] == 0
extracted_conf_value_merged = [extracted_conf_value[ind] # type: ignore else 0.5 * (extracted_confs[ind] + extracted_confs[ind + 1])
if cropped_lines_meging_indexing[ind]==0 for ind in range(len(cropped_lines_meging_indexing))
else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2. if cropped_lines_meging_indexing[ind] >= 0]
if cropped_lines_meging_indexing[ind]==1
else None
for ind in range(len(cropped_lines_meging_indexing))]
extracted_conf_value_merged: List[float] = [extracted_conf_value_merged[ind_cfm]
for ind_cfm in range(len(extracted_texts_merged))
if extracted_texts_merged[ind_cfm] is not None]
extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
return EynollahOcrResult( return EynollahOcrResult(
extracted_texts_merged=extracted_texts_merged, extracted_texts_merged=extracted_texts_merged,
extracted_conf_value_merged=extracted_conf_value_merged, extracted_confs_merged=extracted_confs_merged,
cropped_lines_region_indexer=cropped_lines_region_indexer, cropped_lines_region_indexer=cropped_lines_region_indexer,
total_bb_coordinates=total_bb_coordinates, total_bb_coordinates=total_bb_coordinates,
) )
@ -569,7 +373,7 @@ class Eynollah_ocr(Eynollah):
cropped_lines_region_indexer = result.cropped_lines_region_indexer cropped_lines_region_indexer = result.cropped_lines_region_indexer
total_bb_coordinates = result.total_bb_coordinates total_bb_coordinates = result.total_bb_coordinates
extracted_texts_merged = result.extracted_texts_merged extracted_texts_merged = result.extracted_texts_merged
extracted_conf_value_merged = result.extracted_conf_value_merged extracted_confs_merged = result.extracted_confs_merged
unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
if out_image_with_text: if out_image_with_text:
@ -646,8 +450,8 @@ class Eynollah_ocr(Eynollah):
if not is_textline_text: if not is_textline_text:
text_subelement = ET.SubElement(child_textregion, 'TextEquiv') text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
if extracted_conf_value_merged: if extracted_confs_merged:
text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") text_subelement.set('conf', f"{extracted_confs_merged[indexer]:.2f}")
unicode_textline = ET.SubElement(text_subelement, 'Unicode') unicode_textline = ET.SubElement(text_subelement, 'Unicode')
unicode_textline.text = extracted_texts_merged[indexer] unicode_textline.text = extracted_texts_merged[indexer]
else: else:
@ -655,8 +459,8 @@ class Eynollah_ocr(Eynollah):
if childtest3.tag.endswith("TextEquiv"): if childtest3.tag.endswith("TextEquiv"):
for child_uc in childtest3: for child_uc in childtest3:
if child_uc.tag.endswith("Unicode"): if child_uc.tag.endswith("Unicode"):
if extracted_conf_value_merged: if extracted_confs_merged:
childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") childtest3.set('conf', f"{extracted_confs_merged[indexer]:.2f}")
child_uc.text = extracted_texts_merged[indexer] child_uc.text = extracted_texts_merged[indexer]
indexer = indexer + 1 indexer = indexer + 1

View file

@ -2,6 +2,7 @@ from typing import Iterable, List, Tuple
from logging import getLogger from logging import getLogger
import time import time
import math import math
from itertools import islice
try: try:
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@ -33,6 +34,11 @@ def pairwise(iterable):
yield a, b yield a, b
a = b a = b
def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    Every batch except possibly the last holds exactly ``n`` items; the
    last one holds the remainder. An empty iterable yields nothing.

    Args:
        iterable: any iterable; it is consumed lazily, one batch at a time.
        n: maximum batch size. ``None`` is passed through to ``islice`` and
            therefore yields everything as a single batch.

    Raises:
        ValueError: if ``n`` is less than one. Because this is a generator,
            the error surfaces on first iteration, not at call time.
    """
    # A batch size of 0 would make the first slice empty and end the loop
    # immediately, silently dropping all input -- fail loudly instead.
    if n is not None and n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    # islice returns an empty tuple once the iterator is exhausted,
    # which terminates the loop.
    while batch := tuple(islice(iterator, n)):
        yield batch
def return_multicol_separators_x_start_end( def return_multicol_separators_x_start_end(
regions_without_separators, peak_points, top, bot, regions_without_separators, peak_points, top, bot,
x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some): x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some):

View file

@ -1,6 +1,5 @@
import math import math
import copy import copy
from itertools import islice
import numpy as np import numpy as np
import cv2 import cv2
@ -11,6 +10,7 @@ from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter1d from scipy.ndimage import gaussian_filter1d
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from . import pairwise
from .resize import resize_image from .resize import resize_image
@ -42,45 +42,6 @@ def decode_batch_predictions(pred, num_to_char, max_len = 128):
output.append(d) output.append(d)
return output return output
def distortion_free_resize(image, img_size):
import tensorflow as tf
w, h = img_size
image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True)
# Check tha amount of padding needed to be done.
pad_height = h - tf.shape(image)[0]
pad_width = w - tf.shape(image)[1]
# Only necessary if you want to do same amount of padding on both sides.
if pad_height % 2 != 0:
height = pad_height // 2
pad_height_top = height + 1
pad_height_bottom = height
else:
pad_height_top = pad_height_bottom = pad_height // 2
if pad_width % 2 != 0:
width = pad_width // 2
pad_width_left = width + 1
pad_width_right = width
else:
pad_width_left = pad_width_right = pad_width // 2
image = tf.pad(
image,
paddings=[
[pad_height_top, pad_height_bottom],
[pad_width_left, pad_width_right],
[0, 0],
],
)
image = tf.transpose(image, (1, 0, 2))
image = tf.image.flip_left_right(image)
return image
def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image): def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image):
width = np.shape(textline_image)[1] width = np.shape(textline_image)[1]
height = np.shape(textline_image)[0] height = np.shape(textline_image)[0]
@ -263,254 +224,58 @@ def return_splitting_point_of_image(image_to_spliited):
return np.sort(peaks_sort_4) return np.sort(peaks_sort_4)
def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, img_bin_curved=None): def break_curved_line_into_small_pieces_and_then_merge(img_rgb_curved, img_bin_curved, mask_curved):
peaks_4 = return_splitting_point_of_image(img_curved) peaks_4 = return_splitting_point_of_image(img_rgb_curved)
if len(peaks_4)>0: if len(peaks_4):
imgs_tot = [] imgs_tot = []
for left, right in pairwise([None] + peaks_4 + [None]):
for ind in range(len(peaks_4)+1): img_rgb = img_rgb_curved[:, left: right]
if ind==0: img_bin = img_bin_curved[:, left: right]
img = img_curved[:, :peaks_4[ind], :] mask = mask_curved[:, left: right]
if img_bin_curved is not None:
img_bin = img_bin_curved[:, :peaks_4[ind], :]
mask = mask_curved[:, :peaks_4[ind], :]
elif ind==len(peaks_4):
img = img_curved[:, peaks_4[ind-1]:, :]
if img_bin_curved is not None:
img_bin = img_bin_curved[:, peaks_4[ind-1]:, :]
mask = mask_curved[:, peaks_4[ind-1]:, :]
else:
img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
if img_bin_curved is not None:
img_bin = img_bin_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
or_ma = get_orientation_moments_of_mask(mask) or_ma = get_orientation_moments_of_mask(mask)
imgs_tot.append([img_rgb, img_bin, mask, or_ma])
if img_bin_curved is not None:
imgs_tot.append([img, mask, or_ma, img_bin] )
else:
imgs_tot.append([img, mask, or_ma] )
w_tot_des_list = [] w_tot_des_list = []
w_tot_des = 0 imgs_rgb_deskewed_list = []
imgs_deskewed_list = []
imgs_bin_deskewed_list = [] imgs_bin_deskewed_list = []
for ind in range(len(imgs_tot)): for img_rgb_in, img_bin_in, mask_in, ori_in in imgs_tot:
img_in = imgs_tot[ind][0] if abs(ori_in) < 45:
mask_in = imgs_tot[ind][1] img_rgb_in_des = rotate_image_with_padding(img_rgb_in, ori_in, border_value=(255,255,255) )
ori_in = imgs_tot[ind][2] img_bin_in_des = rotate_image_with_padding(img_bin_in, ori_in, border_value=(255,255,255) )
if img_bin_curved is not None:
img_bin_in = imgs_tot[ind][3]
if abs(ori_in)<45:
img_in_des = rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) )
if img_bin_curved is not None:
img_bin_in_des = rotate_image_with_padding(img_bin_in, ori_in, border_value=(255,255,255) )
mask_in_des = rotate_image_with_padding(mask_in, ori_in) mask_in_des = rotate_image_with_padding(mask_in, ori_in)
mask_in_des = mask_in_des.astype('uint8') # get new bounding box
x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_in_des)
#new bounding box if w_n and h_n:
x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_in_des[:,:,0]) img_rgb_in_des = img_rgb_in_des[y_n: y_n + h_n, x_n: x_n + w_n]
img_bin_in_des = img_bin_in_des[y_n: y_n + h_n, x_n: x_n + w_n]
if w_n==0 or h_n==0:
img_in_des = np.copy(img_in)
if img_bin_curved is not None:
img_bin_in_des = np.copy(img_bin_in)
w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) )
if w_relative==0:
w_relative = img_in_des.shape[1]
img_in_des = resize_image(img_in_des, 32, w_relative)
if img_bin_curved is not None:
img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative)
else: else:
mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] img_rgb_in_des = np.copy(img_rgb_in)
img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :]
if img_bin_curved is not None:
img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :]
w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) )
if w_relative==0:
w_relative = img_in_des.shape[1]
img_in_des = resize_image(img_in_des, 32, w_relative)
if img_bin_curved is not None:
img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative)
else:
img_in_des = np.copy(img_in)
if img_bin_curved is not None:
img_bin_in_des = np.copy(img_bin_in) img_bin_in_des = np.copy(img_bin_in)
w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) else:
if w_relative==0: img_rgb_in_des = np.copy(img_rgb_in)
w_relative = img_in_des.shape[1] img_bin_in_des = np.copy(img_bin_in)
img_in_des = resize_image(img_in_des, 32, w_relative)
if img_bin_curved is not None:
img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative)
w_tot_des+=img_in_des.shape[1] h, w = img_rgb_in_des.shape[:2]
w_tot_des_list.append(img_in_des.shape[1]) new_h = 32
imgs_deskewed_list.append(img_in_des) new_w = 32 * w // h
if img_bin_curved is not None: new_w = new_w or w
imgs_bin_deskewed_list.append(img_bin_in_des) img_rgb_in_des = resize_image(img_rgb_in_des, new_h, new_w)
img_bin_in_des = resize_image(img_bin_in_des, new_h, new_w)
w_tot_des_list.append(new_w)
imgs_rgb_deskewed_list.append(img_rgb_in_des)
imgs_bin_deskewed_list.append(img_bin_in_des)
img_rgb_final_deskewed = np.ones((new_h, sum(w_tot_des_list), 3)) * 255
img_bin_final_deskewed = np.ones((new_h, sum(w_tot_des_list), 3)) * 255
img_final_deskewed = np.zeros((32, w_tot_des, 3))+255
if img_bin_curved is not None:
img_bin_final_deskewed = np.zeros((32, w_tot_des, 3))+255
else:
img_bin_final_deskewed = None
w_indexer = 0 w_indexer = 0
for ind in range(len(w_tot_des_list)): for ind in range(len(w_tot_des_list)):
img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] w_indexer2 = w_indexer + w_tot_des_list[ind]
if img_bin_curved is not None: img_rgb_final_deskewed[:, w_indexer: w_indexer2] = imgs_rgb_deskewed_list[ind]
img_bin_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_bin_deskewed_list[ind][:,:,:] img_bin_final_deskewed[:, w_indexer: w_indexer2] = imgs_bin_deskewed_list[ind]
w_indexer = w_indexer+w_tot_des_list[ind] w_indexer = w_indexer2
return img_final_deskewed, img_bin_final_deskewed return img_rgb_final_deskewed, img_bin_final_deskewed
else: else:
return img_curved, img_bin_curved return img_rgb_curved, img_bin_curved
def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind):
    """Shift a contour by the offsets stored in ``box_ind``, in place.

    ``box_ind[2]`` is added to coordinate 0 and ``box_ind[0]`` to coordinate 1
    of the last axis of ``textline_contour`` (presumably x and y of an OpenCV
    style ``(N, 1, 2)`` contour -- confirm against the caller).

    The array is modified in place and the very same object is returned, so
    callers that need the original values must copy it beforehand.
    """
    offset_for_second, offset_for_first = box_ind[0], box_ind[2]
    textline_contour[:, :, 0] += offset_for_first
    textline_contour[:, :, 1] += offset_for_second
    return textline_contour
def _crop_masked_textline(image, polygon):
    """Cut the bounding box of ``polygon`` out of ``image``.

    Pixels of the crop that lie outside the filled polygon are set to 255.
    Returns the crop (a copy, ``image`` is left untouched) together with the
    width and height of the bounding box.
    """
    x, y, w, h = cv2.boundingRect(polygon)
    mask = np.zeros(image.shape)
    mask = cv2.fillPoly(mask, pts=[polygon], color=(1, 1, 1))
    mask = mask[y:y + h, x:x + w, :]
    # Copy only the crop: copying the whole page once per text line is wasted work.
    crop = image[y:y + h, x:x + w, :].copy()
    crop[mask == 0] = 255
    return crop, w, h


def return_rnn_cnn_ocr_of_given_textlines(image,
                                          all_found_textline_polygons,
                                          all_box_coord,
                                          prediction_model,
                                          b_s_ocr, num_to_char,
                                          curved_line=False):
    """Run the CNN-RNN OCR model on every text line and group texts by region.

    Args:
        image: page image, indexed as ``image[y, x, channel]``.
        all_found_textline_polygons: one sequence of text line polygons per
            text region.
        all_box_coord: one box per region; when ``curved_line`` is false, the
            box offsets are added to (a copy of) each polygon before cropping.
        prediction_model: object whose ``predict(batch, verbose=0)`` result is
            passed on to ``decode_batch_predictions``.
        b_s_ocr: number of line images per prediction batch.
        num_to_char: forwarded unchanged to ``decode_batch_predictions``.
        curved_line: if true, polygons are used as they are, without adding
            the box offsets.

    Returns:
        A list with one entry per region, each a list with one text per text
        line. A region without any polygon still contributes one entry: the
        prediction for a constant placeholder image.
    """
    image_width = 512
    image_height = 32
    # Lines whose width, after scaling to the model height, reaches this value
    # are handed to return_textlines_split_if_needed().
    split_width = 640

    cropped_lines = []   # one model input per crop (two for a split line)
    merge_flags = []     # per crop: 0 = whole line, 1 = first half, -1 = second half
    line_regions = []    # per text line: index of the region it belongs to
    n_regions = 0

    for region_index, region_polygons in enumerate(all_found_textline_polygons):
        n_regions = region_index + 1
        if len(region_polygons) == 0:
            # Keep the region represented so the output stays aligned with the input.
            line_regions.append(region_index)
            merge_flags.append(0)
            cropped_lines.append(np.ones((image_height, image_width, 3)))
            continue

        for polygon in region_polygons:
            line_regions.append(region_index)
            if not curved_line:
                polygon = return_textline_contour_with_added_box_coordinate(
                    copy.deepcopy(polygon), all_box_coord[region_index])
            # NOTE(review): with curved_line=True this clamps the caller's array in place.
            polygon[polygon < 0] = 0

            img_crop, w, h = _crop_masked_textline(image, polygon)
            w_scaled = w * image_height / float(h)

            pieces = None
            if w_scaled >= split_width:
                pieces, _ = return_textlines_split_if_needed(img_crop, None)
            if pieces:
                parts = [(pieces[0], 1), (pieces[1], -1)]
            else:
                parts = [(img_crop, 0)]
            for part, flag in parts:
                cropped_lines.append(
                    preprocess_and_resize_image_for_ocrcnn_model(part,
                                                                 image_height,
                                                                 image_width))
                merge_flags.append(flag)

    extracted_texts = []
    for start in range(0, len(cropped_lines), b_s_ocr):
        # Slicing already shortens the final batch, no special case required.
        imgs = np.array(cropped_lines[start:start + b_s_ocr])
        imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3)
        preds = prediction_model.predict(imgs, verbose=0)
        pred_texts = decode_batch_predictions(preds, num_to_char)
        for ib in range(imgs.shape[0]):
            extracted_texts.append(pred_texts[ib].replace("[UNK]", ""))

    # Re-join the halves of split lines: afterwards there is one text per text line.
    merged_texts = []
    for pos, flag in enumerate(merge_flags):
        if flag == 0:
            merged_texts.append(extracted_texts[pos])
        elif flag == 1:
            merged_texts.append(extracted_texts[pos] + " " + extracted_texts[pos + 1])
        # flag == -1: already consumed together with its first half

    # Single pass grouping; every region got at least one entry above.
    ocr_all_textlines = [[] for _ in range(n_regions)]
    for region_index, text in zip(line_regions, merged_texts):
        ocr_all_textlines[region_index].append(text)
    return ocr_all_textlines
def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    Every batch holds exactly ``n`` items, except possibly the last one, which
    holds whatever is left. An empty iterable yields nothing.

    Raises:
        ValueError: if ``n`` is smaller than one (raised when iteration
            starts, since this is a generator). Without this check a batch
            size of zero would silently discard the whole input.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        yield batch