Introduce model_zoo to Eynollah_ocr

kba 2025-10-20 21:14:52 +02:00
parent d609a532bf
commit 062f317d2e
4 changed files with 149 additions and 138 deletions
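Note: the diff below replaces every direct self.models[...] dict lookup in Eynollah with the EynollahModelZoo accessor API: model categories are registered once via load_models(...) and fetched at call sites via get(category). A minimal runnable sketch of that access pattern (ModelZooSketch is a hypothetical stand-in, not the real EynollahModelZoo):

from typing import Any, Dict, Tuple, Union

class ModelZooSketch:
    """Hypothetical stand-in illustrating the EynollahModelZoo call pattern."""

    def __init__(self) -> None:
        self._loaded: Dict[str, Any] = {}

    def load_models(self, *specs: Union[str, Tuple[str, str]]) -> None:
        # The real zoo resolves each category (optionally a (category, variant)
        # tuple, as in loadable.append(('ocr', 'tr'))) to a model file under
        # its basedir; this sketch stores a placeholder instead.
        for spec in specs:
            category = spec[0] if isinstance(spec, tuple) else spec
            self._loaded[category] = f"<model for {category!r}>"

    def get(self, category: str) -> Any:
        # Mirrors the commit's behavior: get() raises if the category was
        # never loaded, instead of silently returning None.
        if category not in self._loaded:
            raise ValueError(
                f'Model "{category}" not previously loaded with load_models(..)')
        return self._loaded[category]

zoo = ModelZooSketch()
zoo.load_models("enhancement", "col_classifier", ("ocr", "tr"))
print(zoo.get("col_classifier"))  # access by category, no raw dict indexing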


@@ -271,12 +271,12 @@ class Eynollah:
if self.ocr:
if self.tr:
loadable.append(('ocr', 'tr'))
loadable.append(('ocr_tr_processor', 'tr'))
loadable.append(('trocr_processor', 'tr'))
else:
loadable.append('ocr')
loadable.append('num_to_char')
self.models = self.model_zoo.load_models(*loadable)
self.model_zoo.load_models(*loadable)
def __del__(self):
if hasattr(self, 'executor') and getattr(self, 'executor'):
@@ -338,8 +338,8 @@ class Eynollah:
def predict_enhancement(self, img):
self.logger.debug("enter predict_enhancement")
img_height_model = self.models["enhancement"].layers[-1].output_shape[1]
img_width_model = self.models["enhancement"].layers[-1].output_shape[2]
img_height_model = self.model_zoo.get("enhancement").layers[-1].output_shape[1]
img_width_model = self.model_zoo.get("enhancement").layers[-1].output_shape[2]
if img.shape[0] < img_height_model:
img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST)
if img.shape[1] < img_width_model:
@@ -380,7 +380,7 @@ class Eynollah:
index_y_d = img_h - img_height_model
img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :]
label_p_pred = self.models["enhancement"].predict(img_patch, verbose=0)
label_p_pred = self.model_zoo.get("enhancement").predict(img_patch, verbose=0)
seg = label_p_pred[0, :, :, :] * 255
if i == 0 and j == 0:
@@ -555,7 +555,7 @@ class Eynollah:
img_in[0, :, :, 1] = img_1ch[:, :]
img_in[0, :, :, 2] = img_1ch[:, :]
label_p_pred = self.models["col_classifier"].predict(img_in, verbose=0)
label_p_pred = self.model_zoo.get("col_classifier").predict(img_in, verbose=0)
num_col = np.argmax(label_p_pred[0]) + 1
self.logger.info("Found %s columns (%s)", num_col, label_p_pred)
@@ -573,7 +573,7 @@ class Eynollah:
self.logger.info("Detected %s DPI", dpi)
if self.input_binary:
img = self.imread()
prediction_bin = self.do_prediction(True, img, self.models["binarization"], n_batch_inference=5)
prediction_bin = self.do_prediction(True, img, self.model_zoo.get("binarization"), n_batch_inference=5)
prediction_bin = 255 * (prediction_bin[:,:,0] == 0)
prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8)
img= np.copy(prediction_bin)
@@ -613,7 +613,7 @@ class Eynollah:
img_in[0, :, :, 1] = img_1ch[:, :]
img_in[0, :, :, 2] = img_1ch[:, :]
label_p_pred = self.models["col_classifier"].predict(img_in, verbose=0)
label_p_pred = self.model_zoo.get("col_classifier").predict(img_in, verbose=0)
num_col = np.argmax(label_p_pred[0]) + 1
elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower):
@@ -634,7 +634,7 @@ class Eynollah:
img_in[0, :, :, 1] = img_1ch[:, :]
img_in[0, :, :, 2] = img_1ch[:, :]
label_p_pred = self.models["col_classifier"].predict(img_in, verbose=0)
label_p_pred = self.model_zoo.get("col_classifier").predict(img_in, verbose=0)
num_col = np.argmax(label_p_pred[0]) + 1
if num_col > self.num_col_upper:
@@ -1486,7 +1486,7 @@ class Eynollah:
cont_page = []
if not self.ignore_page_extraction:
img = np.copy(self.image)#cv2.GaussianBlur(self.image, (5, 5), 0)
img_page_prediction = self.do_prediction(False, img, self.models["page"])
img_page_prediction = self.do_prediction(False, img, self.model_zoo.get("page"))
imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(imgray, 0, 255, 0)
##thresh = cv2.dilate(thresh, KERNEL, iterations=3)
@@ -1534,7 +1534,7 @@ class Eynollah:
else:
img = self.imread()
img = cv2.GaussianBlur(img, (5, 5), 0)
img_page_prediction = self.do_prediction(False, img, self.models["page"])
img_page_prediction = self.do_prediction(False, img, self.model_zoo.get("page"))
imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(imgray, 0, 255, 0)
@@ -1560,7 +1560,7 @@ class Eynollah:
self.logger.debug("enter extract_text_regions")
img_height_h = img.shape[0]
img_width_h = img.shape[1]
model_region = self.models["region_fl"] if patches else self.models["region_fl_np"]
model_region = self.model_zoo.get("region_fl") if patches else self.model_zoo.get("region_fl_np")
if self.light_version:
thresholding_for_fl_light_version = True
@@ -1595,7 +1595,7 @@ class Eynollah:
self.logger.debug("enter extract_text_regions")
img_height_h = img.shape[0]
img_width_h = img.shape[1]
model_region = self.models["region_fl"] if patches else self.models["region_fl_np"]
model_region = self.model_zoo.get("region_fl") if patches else self.model_zoo.get("region_fl_np")
if not patches:
img = otsu_copy_binary(img)
@@ -1816,14 +1816,14 @@ class Eynollah:
img_w = img_org.shape[1]
img = resize_image(img_org, int(img_org.shape[0] * scaler_h), int(img_org.shape[1] * scaler_w))
prediction_textline = self.do_prediction(use_patches, img, self.models["textline"],
prediction_textline = self.do_prediction(use_patches, img, self.model_zoo.get("textline"),
marginal_of_patch_percent=0.15,
n_batch_inference=3,
thresholding_for_artificial_class_in_light_version=self.textline_light,
threshold_art_class_textline=self.threshold_art_class_textline)
#if not self.textline_light:
#if num_col_classifier==1:
#prediction_textline_nopatch = self.do_prediction(False, img, self.models["textline"])
#prediction_textline_nopatch = self.do_prediction(False, img, self.model_zoo.get_model("textline"))
#prediction_textline[:,:][prediction_textline_nopatch[:,:]==0] = 0
prediction_textline = resize_image(prediction_textline, img_h, img_w)
@@ -1894,7 +1894,7 @@ class Eynollah:
#cv2.imwrite('prediction_textline2.png', prediction_textline[:,:,0])
prediction_textline_longshot = self.do_prediction(False, img, self.models["textline"])
prediction_textline_longshot = self.do_prediction(False, img, self.model_zoo.get("textline"))
prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w)
@@ -1927,7 +1927,7 @@ class Eynollah:
img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new)
img_resized = resize_image(img,img_h_new, img_w_new )
prediction_regions_org, _ = self.do_prediction_new_concept(True, img_resized, self.models["region"])
prediction_regions_org, _ = self.do_prediction_new_concept(True, img_resized, self.model_zoo.get("region"))
prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h )
image_page, page_coord, cont_page = self.extract_page()
@@ -2043,7 +2043,7 @@ class Eynollah:
#if self.input_binary:
#img_bin = np.copy(img_resized)
###if (not self.input_binary and self.full_layout) or (not self.input_binary and num_col_classifier >= 30):
###prediction_bin = self.do_prediction(True, img_resized, self.models["binarization"], n_batch_inference=5)
###prediction_bin = self.do_prediction(True, img_resized, self.model_zoo.get_model("binarization"), n_batch_inference=5)
####print("inside bin ", time.time()-t_bin)
###prediction_bin=prediction_bin[:,:,0]
@@ -2058,7 +2058,7 @@ class Eynollah:
###else:
###img_bin = np.copy(img_resized)
if (self.ocr and self.tr) and not self.input_binary:
prediction_bin = self.do_prediction(True, img_resized, self.models["binarization"], n_batch_inference=5)
prediction_bin = self.do_prediction(True, img_resized, self.model_zoo.get("binarization"), n_batch_inference=5)
prediction_bin = 255 * (prediction_bin[:,:,0] == 0)
prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)
prediction_bin = prediction_bin.astype(np.uint16)
@@ -2090,14 +2090,14 @@ class Eynollah:
self.logger.debug("resized to %dx%d for %d cols",
img_resized.shape[1], img_resized.shape[0], num_col_classifier)
prediction_regions_org, confidence_matrix = self.do_prediction_new_concept(
True, img_resized, self.models["region_1_2"], n_batch_inference=1,
True, img_resized, self.model_zoo.get("region_1_2"), n_batch_inference=1,
thresholding_for_some_classes_in_light_version=True,
threshold_art_class_layout=self.threshold_art_class_layout)
else:
prediction_regions_org = np.zeros((self.image_org.shape[0], self.image_org.shape[1], 3))
confidence_matrix = np.zeros((self.image_org.shape[0], self.image_org.shape[1]))
prediction_regions_page, confidence_matrix_page = self.do_prediction_new_concept(
False, self.image_page_org_size, self.models["region_1_2"], n_batch_inference=1,
False, self.image_page_org_size, self.model_zoo.get("region_1_2"), n_batch_inference=1,
thresholding_for_artificial_class_in_light_version=True,
threshold_art_class_layout=self.threshold_art_class_layout)
ys = slice(*self.page_coord[0:2])
@@ -2111,10 +2111,10 @@ class Eynollah:
self.logger.debug("resized to %dx%d (new_h=%d) for %d cols",
img_resized.shape[1], img_resized.shape[0], new_h, num_col_classifier)
prediction_regions_org, confidence_matrix = self.do_prediction_new_concept(
True, img_resized, self.models["region_1_2"], n_batch_inference=2,
True, img_resized, self.model_zoo.get("region_1_2"), n_batch_inference=2,
thresholding_for_some_classes_in_light_version=True,
threshold_art_class_layout=self.threshold_art_class_layout)
###prediction_regions_org = self.do_prediction(True, img_bin, self.models["region"],
###prediction_regions_org = self.do_prediction(True, img_bin, self.model_zoo.get_model("region"),
###n_batch_inference=3,
###thresholding_for_some_classes_in_light_version=True)
#print("inside 3 ", time.time()-t_in)
@@ -2194,7 +2194,7 @@ class Eynollah:
ratio_x=1
img = resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x))
prediction_regions_org_y = self.do_prediction(True, img, self.models["region"])
prediction_regions_org_y = self.do_prediction(True, img, self.model_zoo.get("region"))
prediction_regions_org_y = resize_image(prediction_regions_org_y, img_height_h, img_width_h )
#plt.imshow(prediction_regions_org_y[:,:,0])
@@ -2209,7 +2209,7 @@ class Eynollah:
_, _ = find_num_col(img_only_regions, num_col_classifier, self.tables, multiplier=6.0)
img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]*(1.2 if is_image_enhanced else 1)))
prediction_regions_org = self.do_prediction(True, img, self.models["region"])
prediction_regions_org = self.do_prediction(True, img, self.model_zoo.get("region"))
prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h )
prediction_regions_org=prediction_regions_org[:,:,0]
@@ -2217,7 +2217,7 @@ class Eynollah:
img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]))
prediction_regions_org2 = self.do_prediction(True, img, self.models["region_p2"], marginal_of_patch_percent=0.2)
prediction_regions_org2 = self.do_prediction(True, img, self.model_zoo.get("region_p2"), marginal_of_patch_percent=0.2)
prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h )
mask_zeros2 = (prediction_regions_org2[:,:,0] == 0)
@@ -2241,7 +2241,7 @@ class Eynollah:
if self.input_binary:
prediction_bin = np.copy(img_org)
else:
prediction_bin = self.do_prediction(True, img_org, self.models["binarization"], n_batch_inference=5)
prediction_bin = self.do_prediction(True, img_org, self.model_zoo.get("binarization"), n_batch_inference=5)
prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h )
prediction_bin = 255 * (prediction_bin[:,:,0]==0)
prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)
@@ -2251,7 +2251,7 @@ class Eynollah:
img = resize_image(prediction_bin, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x))
prediction_regions_org = self.do_prediction(True, img, self.models["region"])
prediction_regions_org = self.do_prediction(True, img, self.model_zoo.get("region"))
prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h )
prediction_regions_org=prediction_regions_org[:,:,0]
@@ -2278,7 +2278,7 @@ class Eynollah:
except:
if self.input_binary:
prediction_bin = np.copy(img_org)
prediction_bin = self.do_prediction(True, img_org, self.models["binarization"], n_batch_inference=5)
prediction_bin = self.do_prediction(True, img_org, self.model_zoo.get("binarization"), n_batch_inference=5)
prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h )
prediction_bin = 255 * (prediction_bin[:,:,0]==0)
prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)
@@ -2289,14 +2289,14 @@ class Eynollah:
img = resize_image(prediction_bin, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x))
prediction_regions_org = self.do_prediction(True, img, self.models["region"])
prediction_regions_org = self.do_prediction(True, img, self.model_zoo.get("region"))
prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h )
prediction_regions_org=prediction_regions_org[:,:,0]
#mask_lines_only=(prediction_regions_org[:,:]==3)*1
#img = resize_image(img_org, int(img_org.shape[0]*1), int(img_org.shape[1]*1))
#prediction_regions_org = self.do_prediction(True, img, self.models["region"])
#prediction_regions_org = self.do_prediction(True, img, self.model_zoo.get_model("region"))
#prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h )
#prediction_regions_org = prediction_regions_org[:,:,0]
#prediction_regions_org[(prediction_regions_org[:,:] == 1) & (mask_zeros_y[:,:] == 1)]=0
@@ -2667,13 +2667,13 @@ class Eynollah:
img_width_h = img_org.shape[1]
patches = False
if self.light_version:
prediction_table, _ = self.do_prediction_new_concept(patches, img, self.models["table"])
prediction_table, _ = self.do_prediction_new_concept(patches, img, self.model_zoo.get("table"))
prediction_table = prediction_table.astype(np.int16)
return prediction_table[:,:,0]
else:
if num_col_classifier < 4 and num_col_classifier > 2:
prediction_table = self.do_prediction(patches, img, self.models["table"])
pre_updown = self.do_prediction(patches, cv2.flip(img[:,:,:], -1), self.models["table"])
prediction_table = self.do_prediction(patches, img, self.model_zoo.get("table"))
pre_updown = self.do_prediction(patches, cv2.flip(img[:,:,:], -1), self.model_zoo.get("table"))
pre_updown = cv2.flip(pre_updown, -1)
prediction_table[:,:,0][pre_updown[:,:,0]==1]=1
@@ -2692,8 +2692,8 @@ class Eynollah:
xs = slice(w_start, w_start + img.shape[1])
img_new[ys, xs] = img
prediction_ext = self.do_prediction(patches, img_new, self.models["table"])
pre_updown = self.do_prediction(patches, cv2.flip(img_new[:,:,:], -1), self.models["table"])
prediction_ext = self.do_prediction(patches, img_new, self.model_zoo.get("table"))
pre_updown = self.do_prediction(patches, cv2.flip(img_new[:,:,:], -1), self.model_zoo.get("table"))
pre_updown = cv2.flip(pre_updown, -1)
prediction_table = prediction_ext[ys, xs]
@@ -2714,8 +2714,8 @@ class Eynollah:
xs = slice(w_start, w_start + img.shape[1])
img_new[ys, xs] = img
prediction_ext = self.do_prediction(patches, img_new, self.models["table"])
pre_updown = self.do_prediction(patches, cv2.flip(img_new[:,:,:], -1), self.models["table"])
prediction_ext = self.do_prediction(patches, img_new, self.model_zoo.get("table"))
pre_updown = self.do_prediction(patches, cv2.flip(img_new[:,:,:], -1), self.model_zoo.get("table"))
pre_updown = cv2.flip(pre_updown, -1)
prediction_table = prediction_ext[ys, xs]
@@ -2727,10 +2727,10 @@ class Eynollah:
prediction_table = np.zeros(img.shape)
img_w_half = img.shape[1] // 2
pre1 = self.do_prediction(patches, img[:,0:img_w_half,:], self.models["table"])
pre2 = self.do_prediction(patches, img[:,img_w_half:,:], self.models["table"])
pre_full = self.do_prediction(patches, img[:,:,:], self.models["table"])
pre_updown = self.do_prediction(patches, cv2.flip(img[:,:,:], -1), self.models["table"])
pre1 = self.do_prediction(patches, img[:,0:img_w_half,:], self.model_zoo.get("table"))
pre2 = self.do_prediction(patches, img[:,img_w_half:,:], self.model_zoo.get("table"))
pre_full = self.do_prediction(patches, img[:,:,:], self.model_zoo.get("table"))
pre_updown = self.do_prediction(patches, cv2.flip(img[:,:,:], -1), self.model_zoo.get("table"))
pre_updown = cv2.flip(pre_updown, -1)
prediction_table_full_erode = cv2.erode(pre_full[:,:,0], KERNEL, iterations=4)
@@ -3522,7 +3522,7 @@ class Eynollah:
tot_counter += 1
batch.append(j)
if tot_counter % inference_bs == 0 or tot_counter == len(ij_list):
y_pr = self.models["reading_order"].predict(input_1 , verbose=0)
y_pr = self.model_zoo.get("reading_order").predict(input_1 , verbose=0)
for jb, j in enumerate(batch):
if y_pr[jb][0]>=0.5:
post_list.append(j)
@@ -4105,7 +4105,7 @@ class Eynollah:
gc.collect()
ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons, np.zeros((len(all_found_textline_polygons), 4)),
self.models["ocr"], self.b_s_ocr, self.models["num_to_char"], textline_light=True)
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), textline_light=True)
else:
ocr_all_textlines = None
@@ -4614,27 +4614,27 @@ class Eynollah:
if len(all_found_textline_polygons):
ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons, all_box_coord,
self.models["ocr"], self.b_s_ocr, self.models["num_to_char"], self.textline_light, self.curved_line)
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
if len(all_found_textline_polygons_marginals_left):
ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons_marginals_left, all_box_coord_marginals_left,
self.models["ocr"], self.b_s_ocr, self.models["num_to_char"], self.textline_light, self.curved_line)
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
if len(all_found_textline_polygons_marginals_right):
ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons_marginals_right, all_box_coord_marginals_right,
self.models["ocr"], self.b_s_ocr, self.models["num_to_char"], self.textline_light, self.curved_line)
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
if self.full_layout and len(all_found_textline_polygons):
ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons_h, all_box_coord_h,
self.models["ocr"], self.b_s_ocr, self.models["num_to_char"], self.textline_light, self.curved_line)
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
if self.full_layout and len(polygons_of_drop_capitals):
ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines(
image_page, polygons_of_drop_capitals, np.zeros((len(polygons_of_drop_capitals), 4)),
self.models["ocr"], self.b_s_ocr, self.models["num_to_char"], self.textline_light, self.curved_line)
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
else:
if self.light_version:
@@ -4646,7 +4646,7 @@ class Eynollah:
gc.collect()
torch.cuda.empty_cache()
self.models["ocr"].to(self.device)
self.model_zoo.get("ocr").to(self.device)
ind_tot = 0
#cv2.imwrite('./img_out.png', image_page)
@@ -4683,7 +4683,7 @@ class Eynollah:
img_croped = img_poly_on_img[y:y+h, x:x+w, :]
#cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped)
text_ocr = self.return_ocr_of_textline_without_common_section(
img_croped, self.models["ocr"], self.models['ocr_tr_processor'], self.device, w, h2w_ratio, ind_tot)
img_croped, self.model_zoo.get("ocr"), self.model_zoo.get("trocr_processor"), self.device, w, h2w_ratio, ind_tot)
ocr_textline_in_textregion.append(text_ocr)
ind_tot = ind_tot +1
ocr_all_textlines.append(ocr_textline_in_textregion)


@@ -1,6 +1,6 @@
# pyright: reportPossiblyUnboundVariable=false
from logging import getLogger
from logging import Logger, getLogger
from typing import Optional
from pathlib import Path
import os
@@ -8,23 +8,31 @@ import json
import gc
import sys
import math
import cv2
import time
from keras.layers import StringLookup
from eynollah.utils.resize import resize_image
from eynollah.utils.utils_ocr import break_curved_line_into_small_pieces_and_then_merge, decode_batch_predictions, fit_text_single_line, get_contours_and_bounding_boxes, get_orientation_moments, preprocess_and_resize_image_for_ocrcnn_model, return_textlines_split_if_needed, rotate_image_with_padding
from .utils import is_image_filename
import cv2
import xml.etree.ElementTree as ET
import tensorflow as tf
from keras.models import load_model
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from eynollah.model_zoo import EynollahModelZoo
import torch
from .utils import is_image_filename
from .utils.resize import resize_image
from .utils.utils_ocr import (
break_curved_line_into_small_pieces_and_then_merge,
decode_batch_predictions,
fit_text_single_line,
get_contours_and_bounding_boxes,
get_orientation_moments,
preprocess_and_resize_image_for_ocrcnn_model,
return_textlines_split_if_needed,
rotate_image_with_padding,
)
# cannot use importlib.resources until we move to 3.9+ for importlib.resources.files
if sys.version_info < (3, 10):
import importlib_resources
@@ -43,68 +51,51 @@ class Eynollah_ocr:
model_name=None,
dir_xmls=None,
tr_ocr=False,
batch_size=None,
export_textline_images_and_text=False,
do_not_mask_with_textline_contour=False,
batch_size: Optional[int]=None,
export_textline_images_and_text: bool=False,
do_not_mask_with_textline_contour: bool=False,
pref_of_dataset=None,
min_conf_value_of_textline_text : Optional[float]=None,
logger=None,
min_conf_value_of_textline_text : float=0.3,
logger: Optional[Logger]=None,
):
self.model_name = model_name
self.tr_ocr = tr_ocr
self.export_textline_images_and_text = export_textline_images_and_text
self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
self.pref_of_dataset = pref_of_dataset
self.logger = logger if logger else getLogger('eynollah')
self.model_zoo = EynollahModelZoo(basedir=dir_models)
if not export_textline_images_and_text:
if min_conf_value_of_textline_text:
self.min_conf_value_of_textline_text = float(min_conf_value_of_textline_text)
# TODO: Properly document what 'export_textline_images_and_text' is about
if export_textline_images_and_text:
self.logger.info("export_textline_images_and_text was set, so no actual models are loaded")
return
self.min_conf_value_of_textline_text = min_conf_value_of_textline_text
self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size
if tr_ocr:
self.model_zoo.load_model('trocr_processor', '')
if model_name:
self.model_zoo.load_model('ocr', 'tr', model_name)
else:
self.min_conf_value_of_textline_text = 0.3
if tr_ocr:
assert TrOCRProcessor
self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if self.model_name:
self.model_ocr_dir = self.model_name
else:
self.model_ocr_dir = dir_models + "/model_eynollah_ocr_trocr_20250919"
self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir)
self.model_ocr.to(self.device)
if not batch_size:
self.b_s = 2
else:
self.b_s = int(batch_size)
self.model_zoo.load_model('ocr', 'tr')
self.model_zoo.get('ocr').to(self.device)
else:
if model_name:
self.model_zoo.load_model('ocr', '', model_name)
else:
if self.model_name:
self.model_ocr_dir = self.model_name
else:
self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250930"
model_ocr = load_model(self.model_ocr_dir , compile=False)
self.prediction_model = tf.keras.models.Model(
model_ocr.get_layer(name = "image").input,
model_ocr.get_layer(name = "dense2").output)
if not batch_size:
self.b_s = 8
else:
self.b_s = int(batch_size)
with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file:
characters = json.load(config_file)
AUTOTUNE = tf.data.AUTOTUNE
self.model_zoo.load_model('ocr', '')
self.model_zoo.load_model('num_to_char')
self.end_character = len(self.model_zoo.load_model('characters')) + 2
# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)
# Mapping integers back to original characters.
self.num_to_char = StringLookup(
vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)
self.end_character = len(characters) + 2
@property
def device(self):
if torch.cuda.is_available():
self.logger.info("Using GPU acceleration")
return torch.device("cuda:0")
else:
self.logger.info("Using CPU processing")
return torch.device("cpu")
def run(self, overwrite: bool = False,
dir_in: Optional[str] = None,
@@ -119,13 +110,16 @@ class Eynollah_ocr:
for image_filename in filter(is_image_filename,
os.listdir(dir_in))]
else:
assert image_filename
ls_imgs = [image_filename]
if self.tr_ocr:
tr_ocr_input_height_and_width = 384
for dir_img in ls_imgs:
file_name = Path(dir_img).stem
assert dir_xmls # FIXME: check the logic
dir_xml = os.path.join(dir_xmls, file_name+'.xml')
assert dir_out # FIXME: check the logic
out_file_ocr = os.path.join(dir_out, file_name+'.xml')
if os.path.exists(out_file_ocr):
@@ -204,10 +198,10 @@
cropped_lines = []
indexer_b_s = 0
pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_ocr.generate(
pixel_values_merged.to(self.device))
generated_text_merged = self.processor.batch_decode(
generated_text_merged = self.model_zoo.get('processor').batch_decode(
generated_ids_merged, skip_special_tokens=True)
extracted_texts = extracted_texts + generated_text_merged
@@ -227,10 +221,10 @@
cropped_lines = []
indexer_b_s = 0
pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_ocr.generate(
pixel_values_merged.to(self.device))
generated_text_merged = self.processor.batch_decode(
generated_text_merged = self.model_zoo.get('processor').batch_decode(
generated_ids_merged, skip_special_tokens=True)
extracted_texts = extracted_texts + generated_text_merged
@@ -247,10 +241,10 @@
cropped_lines = []
indexer_b_s = 0
pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_ocr.generate(
pixel_values_merged.to(self.device))
generated_text_merged = self.processor.batch_decode(
generated_text_merged = self.model_zoo.get('processor').batch_decode(
generated_ids_merged, skip_special_tokens=True)
extracted_texts = extracted_texts + generated_text_merged
@@ -265,10 +259,10 @@
cropped_lines = []
indexer_b_s = 0
pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_ocr.generate(
pixel_values_merged.to(self.device))
generated_text_merged = self.processor.batch_decode(
generated_text_merged = self.model_zoo.get('processor').batch_decode(
generated_ids_merged, skip_special_tokens=True)
extracted_texts = extracted_texts + generated_text_merged
@@ -282,9 +276,9 @@
cropped_lines = []
indexer_b_s = 0
pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device))
generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True)
generated_text_merged = self.model_zoo.get('processor').batch_decode(generated_ids_merged, skip_special_tokens=True)
extracted_texts = extracted_texts + generated_text_merged
@@ -299,10 +293,10 @@
####n_start = i*self.b_s
####n_end = (i+1)*self.b_s
####imgs = cropped_lines[n_start:n_end]
####pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values
####pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
####generated_ids_merged = self.model_ocr.generate(
#### pixel_values_merged.to(self.device))
####generated_text_merged = self.processor.batch_decode(
####generated_text_merged = self.model_zoo.get('processor').batch_decode(
#### generated_ids_merged, skip_special_tokens=True)
####extracted_texts = extracted_texts + generated_text_merged
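
The tr_ocr code path above repeats one batched TrOCR decode pattern, now routed through the model zoo's processor entry: the processor turns cropped line images into pixel tensors, generate() produces token ids, and batch_decode() maps the ids back to strings. A runnable sketch of that pattern, assuming the default microsoft/trocr-base-printed weights and blank placeholder images instead of real textline crops:

import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Placeholder batch; the real code crops textlines from the page image
# and feeds them at the 384x384 TrOCR input size.
imgs = [Image.new("RGB", (384, 384), "white") for _ in range(2)]

pixel_values = processor(imgs, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values.to(device))
texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(texts)  # one decoded string per cropped line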


@@ -11,7 +11,7 @@ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from eynollah.patch_encoder import PatchEncoder, Patches
SomeEynollahModel = Union[VisionEncoderDecoderModel, TrOCRProcessor, Model]
SomeEynollahModel = Union[VisionEncoderDecoderModel, TrOCRProcessor, Model, List]
# Dict mapping model_category to dict mapping variant (default is '') to Path
@@ -114,14 +114,19 @@ DEFAULT_MODEL_VERSIONS: Dict[str, Dict[str, str]] = {
'': "model_eynollah_ocr_cnnrnn_20250930",
},
'ocr_tr_processor': {
'trocr_processor': {
'': 'microsoft/trocr-base-printed',
'htr': "microsoft/trocr-base-handwritten",
},
'num_to_char': {
'': 'model_eynollah_ocr_cnnrnn_20250930/characters_org.txt'
'': 'characters_org.txt'
},
'characters': {
'': 'characters_org.txt'
},
}
@@ -142,7 +147,7 @@ class EynollahModelZoo():
self.model_versions = deepcopy(DEFAULT_MODEL_VERSIONS)
if model_overrides:
self.override_models(*model_overrides)
self._loaded: Dict[Tuple[str, str], SomeEynollahModel] = {}
self._loaded: Dict[str, SomeEynollahModel] = {}
def override_models(self, *model_overrides: Tuple[str, str, str]):
"""
@@ -216,7 +221,9 @@ class EynollahModelZoo():
model = self._load_ocr_model(variant=model_variant)
elif model_category == 'num_to_char':
model = self._load_num_to_char()
elif model_category == 'tr_processor':
elif model_category == 'characters':
model = self._load_characters()
elif model_category == 'trocr_processor':
return TrOCRProcessor.from_pretrained(self.model_path(...))
else:
try:
@@ -225,14 +232,13 @@ class EynollahModelZoo():
self.logger.exception(e)
model = load_model(model_path, compile=False, custom_objects={
"PatchEncoder": PatchEncoder, "Patches": Patches})
self._loaded[(model_category, model_variant)] = model
self._loaded[model_category] = model
return model # type: ignore
def get_model(self, model_categeory, model_variant) -> SomeEynollahModel:
needle = (model_categeory, model_variant)
if needle not in self._loaded:
raise ValueError('Model/variant "{needle} not previously loaded with "load_model(..)"')
return self._loaded[needle]
def get(self, model_category) -> SomeEynollahModel:
if model_category not in self._loaded:
raise ValueError(f'Model "{model_category} not previously loaded with "load_model(..)"')
return self._loaded[model_category]
def _load_ocr_model(self, variant: str) -> SomeEynollahModel:
"""
@@ -247,15 +253,21 @@ class EynollahModelZoo():
return Model(
ocr_model.get_layer(name = "image").input, # type: ignore
ocr_model.get_layer(name = "dense2").output) # type: ignore
def _load_characters(self) -> List[str]:
"""
Load encoding for OCR
"""
with open(self.model_path('ocr') / self.model_path('num_to_char', absolute=False), "r") as config_file:
return json.load(config_file)
def _load_num_to_char(self):
def _load_num_to_char(self) -> StringLookup:
"""
Load decoder for OCR
"""
with open(self.model_path('ocr') / self.model_path('ocr', 'num_to_char', absolute=False), "r") as config_file:
characters = json.load(config_file)
characters = self._load_characters()
# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)
char_to_num = StringLookup(vocabulary=characters, mask_token=None)
# Mapping integers back to original characters.
return StringLookup(
vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
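
The new _load_characters/_load_num_to_char pair splits vocabulary loading from decoder construction: characters_org.txt supplies the character list, a forward StringLookup assigns integer ids, and an inverted StringLookup over the same vocabulary maps predicted ids back to characters. A small runnable sketch of that round trip, using a toy vocabulary in place of characters_org.txt:

from keras.layers import StringLookup

characters = ["a", "b", "c"]  # toy stand-in for the json-loaded characters_org.txt

# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=characters, mask_token=None)
# Mapping integers back to original characters.
num_to_char = StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)

ids = char_to_num(["a", "c"])
print(num_to_char(ids))  # prints byte strings [b'a' b'c'], round-tripping the input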


@@ -393,7 +393,12 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8):
z = gaussian_filter1d(regions_without_separators_0, sigma_)
return np.std(z)
def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8):
def find_num_col(
regions_without_separators,
num_col_classifier,
tables,
multiplier=3.8,
):
if not regions_without_separators.any():
return 0, []
#plt.imshow(regions_without_separators)