🔥 remove OCR option from eynollah layout

This commit is contained in:
kba 2025-11-26 15:34:36 +01:00
parent 0f410c2e7c
commit 5a1900e664
5 changed files with 16 additions and 360 deletions

View file

@@ -120,9 +120,6 @@ The following options can be used to further configure the processing:
| `-sa <directory>` | save all (plot, enhanced/binary image, layout) to this directory |
| `-thart` | threshold of artificial class in the case of textline detection. The default value is 0.1 |
| `-tharl` | threshold of artificial class in the case of layout detection. The default value is 0.1 |
| `-ocr` | do ocr |
| `-tr` | apply transformer ocr. Default model is a CNN-RNN model |
| `-bs_ocr` | ocr inference batch size. Default bs for trocr and cnn_rnn models are 2 and 8 respectively |
| `-ncu` | upper limit of columns in document image |
| `-ncl` | lower limit of columns in document image |
| `-slro` | skip layout detection and reading order |

View file

@@ -321,7 +321,7 @@ def enhancement(ctx, image, out, overwrite, dir_in, num_col_upper, num_col_lower
"--input_binary/--input-RGB",
"-ib/-irgb",
is_flag=True,
help="in general, eynollah uses RGB as input but if the input document is strongly dark, bright or for any other reason you can turn binarized input on. This option does not mean that you have to provide a binary image, otherwise this means that the tool itself will binarized the RGB input document.",
help="In general, eynollah uses RGB as input but if the input document is very dark, very bright or for any other reason you can turn on input binarization. When this flag is set, eynollah will binarize the RGB input document, you should always provide RGB images to eynollah.",
)
@click.option(
"--allow_scaling/--no-allow-scaling",
@@ -353,23 +353,6 @@ def enhancement(ctx, image, out, overwrite, dir_in, num_col_upper, num_col_lower
is_flag=True,
help="if this parameter set to true, this tool would apply machine based reading order detection",
)
@click.option(
"--do_ocr",
"-ocr/-noocr",
is_flag=True,
help="if this parameter set to true, this tool will try to do ocr",
)
@click.option(
"--transformer_ocr",
"-tr/-notr",
is_flag=True,
help="if this parameter set to true, this tool will apply transformer ocr",
)
@click.option(
"--batch_size_ocr",
"-bs_ocr",
help="number of inference batch size of ocr model. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively",
)
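The removed OCR switches follow click's paired-flag pattern: one boolean parameter declared with an on/off pair of short options. A minimal, standalone sketch of that pattern, assuming a throwaway command and flag names that are not part of the eynollah CLI:

```python
import click

# Standalone illustration of the paired-flag pattern; the command and
# option names below are examples only, not part of the eynollah CLI.
@click.command()
@click.option(
    "--do_ocr",
    "-ocr/-noocr",
    is_flag=True,
    help="toggle the (hypothetical) OCR step on with -ocr or off with -noocr",
)
def demo(do_ocr):
    click.echo(f"do_ocr={do_ocr}")

if __name__ == "__main__":
    demo()
```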
@click.option(
"--num_col_upper",
"-ncu",
@@ -421,9 +404,6 @@ def layout(
headers_off,
light_version,
reading_order_machine_based,
do_ocr,
transformer_ocr,
batch_size_ocr,
num_col_upper,
num_col_lower,
threshold_art_class_textline,
@@ -470,9 +450,6 @@ def layout(
light_version=light_version,
ignore_page_extraction=ignore_page_extraction,
reading_order_machine_based=reading_order_machine_based,
do_ocr=do_ocr,
transformer_ocr=transformer_ocr,
batch_size_ocr=batch_size_ocr,
num_col_upper=num_col_upper,
num_col_lower=num_col_lower,
skip_layout_and_reading_order=skip_layout_and_reading_order,
@@ -506,7 +483,15 @@ def layout(
@click.option(
"--dir_in_bin",
"-dib",
help="directory of binarized images (in addition to --dir_in for RGB images; filename stems must match the RGB image files, with '.png' suffix).\nPerform prediction using both RGB and binary images. (This does not necessarily improve results, however it may be beneficial for certain document images.)",
help=("""
directory of binarized images (in addition to --dir_in for RGB
images; filename stems must match the RGB image files, with '.png'
\n
Perform prediction using both RGB and binary images.
(This does not necessarily improve results, however it may be beneficial
for certain document images.
"""),
type=click.Path(exists=True, file_okay=False),
)
@click.option(

View file

@@ -95,19 +95,6 @@ from .utils.rotate import (
rotation_not_90_func_full_layout,
rotation_image_new
)
from .utils.utils_ocr import (
return_start_and_end_of_common_text_of_textline_ocr_without_common_section,
return_textline_contour_with_added_box_coordinate,
preprocess_and_resize_image_for_ocrcnn_model,
return_textlines_split_if_needed,
decode_batch_predictions,
return_rnn_cnn_ocr_of_given_textlines,
fit_text_single_line,
break_curved_line_into_small_pieces_and_then_merge,
get_orientation_moments,
rotate_image_with_padding,
get_contours_and_bounding_boxes
)
from .utils.separate_lines import (
separate_lines_new2,
return_deskew_slop,
@@ -176,9 +163,6 @@ class Eynollah:
light_version : bool = False,
ignore_page_extraction : bool = False,
reading_order_machine_based : bool = False,
do_ocr : bool = False,
transformer_ocr: bool = False,
batch_size_ocr: Optional[int] = None,
num_col_upper : Optional[int] = None,
num_col_lower : Optional[int] = None,
threshold_art_class_layout: Optional[float] = None,
@@ -209,12 +193,6 @@ class Eynollah:
self.extract_only_images = extract_only_images
self.ignore_page_extraction = ignore_page_extraction
self.skip_layout_and_reading_order = skip_layout_and_reading_order
self.ocr = do_ocr
self.tr = transformer_ocr
if not batch_size_ocr:
self.b_s_ocr = 8
else:
self.b_s_ocr = int(batch_size_ocr)
if num_col_upper:
self.num_col_upper = int(num_col_upper)
else:
@@ -284,14 +262,6 @@ class Eynollah:
if self.tables:
loadable.append(("table", 'light' if self.light_version else ''))
if self.ocr:
if self.tr:
loadable.append(('ocr', 'tr'))
loadable.append(('trocr_processor', ''))
else:
loadable.append('ocr')
loadable.append('num_to_char')
self.model_zoo.load_models(*loadable)
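The loadable list above is built up conditionally and then expanded into a single load call with star-unpacking. A generic sketch of that pattern, with a placeholder loader class and made-up model names (eynollah's real model zoo is not reproduced here):

```python
from typing import Tuple, Union

class DummyModelZoo:
    """Placeholder loader that only records which specs were requested."""
    def __init__(self):
        self.loaded = {}

    def load_models(self, *specs: Union[str, Tuple[str, str]]):
        for spec in specs:
            # a spec is either a bare model name or a (name, variant) pair
            name, variant = spec if isinstance(spec, tuple) else (spec, "")
            self.loaded[name] = variant or "default"

light_version, tables = True, True

loadable = ["page", ("region", "light" if light_version else "")]
if tables:
    loadable.append(("table", "light" if light_version else ""))

zoo = DummyModelZoo()
zoo.load_models(*loadable)   # star-unpacking passes every spec as its own argument
print(zoo.loaded)            # {'page': 'default', 'region': 'light', 'table': 'light'}
```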
def __del__(self):
@@ -2078,14 +2048,6 @@ class Eynollah:
###img_bin = np.copy(prediction_bin)
###else:
###img_bin = np.copy(img_resized)
if (self.ocr and self.tr) and not self.input_binary:
prediction_bin = self.do_prediction(True, img_resized, self.model_zoo.get("binarization"), n_batch_inference=5)
prediction_bin = 255 * (prediction_bin[:,:,0] == 0)
prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)
prediction_bin = prediction_bin.astype(np.uint16)
#img= np.copy(prediction_bin)
img_bin = np.copy(prediction_bin)
else:
img_bin = np.copy(img_resized)
#print("inside 1 ", time.time()-t_in)
@@ -3586,190 +3548,13 @@ class Eynollah:
region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))]
return ordered, region_ids
def return_start_and_end_of_common_text_of_textline_ocr(self,textline_image, ind_tot):
width = np.shape(textline_image)[1]
height = np.shape(textline_image)[0]
common_window = int(0.2*width)
width1 = int ( width/2. - common_window )
width2 = int ( width/2. + common_window )
img_sum = np.sum(textline_image[:,:,0], axis=0)
sum_smoothed = gaussian_filter1d(img_sum, 3)
peaks_real, _ = find_peaks(sum_smoothed, height=0)
if len(peaks_real)>70:
print(len(peaks_real), 'len(peaks_real)')
peaks_real = peaks_real[(peaks_real<width2) & (peaks_real>width1)]
arg_sort = np.argsort(sum_smoothed[peaks_real])
arg_sort4 =arg_sort[::-1][:4]
peaks_sort_4 = peaks_real[arg_sort][::-1][:4]
argsort_sorted = np.argsort(peaks_sort_4)
first_4_sorted = peaks_sort_4[argsort_sorted]
y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]]
#print(first_4_sorted,'first_4_sorted')
arg_sortnew = np.argsort(y_4_sorted)
peaks_final =np.sort( first_4_sorted[arg_sortnew][2:] )
#plt.figure(ind_tot)
#plt.imshow(textline_image)
#plt.plot([peaks_final[0], peaks_final[0]], [0, height-1])
#plt.plot([peaks_final[1], peaks_final[1]], [0, height-1])
#plt.savefig('./'+str(ind_tot)+'.png')
return peaks_final[0], peaks_final[1]
else:
pass
def return_start_and_end_of_common_text_of_textline_ocr_new_splitted(
self, peaks_real, sum_smoothed, start_split, end_split):
peaks_real = peaks_real[(peaks_real<end_split) & (peaks_real>start_split)]
arg_sort = np.argsort(sum_smoothed[peaks_real])
arg_sort4 =arg_sort[::-1][:4]
peaks_sort_4 = peaks_real[arg_sort][::-1][:4]
argsort_sorted = np.argsort(peaks_sort_4)
first_4_sorted = peaks_sort_4[argsort_sorted]
y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]]
#print(first_4_sorted,'first_4_sorted')
arg_sortnew = np.argsort(y_4_sorted)
peaks_final =np.sort( first_4_sorted[arg_sortnew][3:] )
return peaks_final[0]
def return_start_and_end_of_common_text_of_textline_ocr_new(self, textline_image, ind_tot):
width = np.shape(textline_image)[1]
height = np.shape(textline_image)[0]
common_window = int(0.15*width)
width1 = int ( width/2. - common_window )
width2 = int ( width/2. + common_window )
mid = int(width/2.)
img_sum = np.sum(textline_image[:,:,0], axis=0)
sum_smoothed = gaussian_filter1d(img_sum, 3)
peaks_real, _ = find_peaks(sum_smoothed, height=0)
if len(peaks_real)>70:
peak_start = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted(
peaks_real, sum_smoothed, width1, mid+2)
peak_end = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted(
peaks_real, sum_smoothed, mid-2, width2)
#plt.figure(ind_tot)
#plt.imshow(textline_image)
#plt.plot([peak_start, peak_start], [0, height-1])
#plt.plot([peak_end, peak_end], [0, height-1])
#plt.savefig('./'+str(ind_tot)+'.png')
return peak_start, peak_end
else:
pass
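The three removed helpers above share one idea: sum the pixel values of each column of the textline image (a vertical projection profile), smooth the profile with a Gaussian, and treat strong peaks near the middle, i.e. bright low-ink columns, as candidate split points. A simplified, self-contained sketch of that idea; the removed code ranks the four strongest peaks, whereas this sketch just takes the single strongest one in the centre window:

```python
import numpy as np
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks

def middle_split_point(textline_image, window_frac=0.2):
    """Return a column index near the centre of a textline image where the
    smoothed vertical projection profile peaks, or None if no peak is found."""
    width = textline_image.shape[1]
    lo = int(width / 2.0 - window_frac * width)
    hi = int(width / 2.0 + window_frac * width)

    # vertical projection profile: sum of pixel values in every column;
    # on a dark-text/white-background image its peaks are low-ink columns
    profile = textline_image[:, :, 0].sum(axis=0)
    smoothed = gaussian_filter1d(profile, 3)

    peaks, _ = find_peaks(smoothed, height=0)
    peaks = peaks[(peaks > lo) & (peaks < hi)]      # keep peaks near the middle
    if len(peaks) == 0:
        return None
    return int(peaks[np.argmax(smoothed[peaks])])   # strongest central peak

line_img = (np.random.rand(32, 400, 3) * 255).astype(np.uint8)
print(middle_split_point(line_img))
```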
def return_ocr_of_textline_without_common_section(
self,
textline_image,
model_ocr,
processor,
device,
width_textline,
h2w_ratio,
ind_tot,
):
if h2w_ratio > 0.05:
pixel_values = processor(textline_image, return_tensors="pt").pixel_values
generated_ids = model_ocr.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
else:
#width = np.shape(textline_image)[1]
#height = np.shape(textline_image)[0]
#common_window = int(0.3*width)
#width1 = int ( width/2. - common_window )
#width2 = int ( width/2. + common_window )
split_point = return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image)
if split_point:
image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height))
image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height))
#pixel_values1 = processor(image1, return_tensors="pt").pixel_values
#pixel_values2 = processor(image2, return_tensors="pt").pixel_values
pixel_values_merged = processor([image1,image2], return_tensors="pt").pixel_values
generated_ids_merged = model_ocr.generate(pixel_values_merged.to(device))
generated_text_merged = processor.batch_decode(generated_ids_merged, skip_special_tokens=True)
#print(generated_text_merged,'generated_text_merged')
#generated_ids1 = model_ocr.generate(pixel_values1.to(device))
#generated_ids2 = model_ocr.generate(pixel_values2.to(device))
#generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0]
#generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0]
#generated_text = generated_text1 + ' ' + generated_text2
generated_text = generated_text_merged[0] + ' ' + generated_text_merged[1]
#print(generated_text1,'generated_text1')
#print(generated_text2, 'generated_text2')
#print('########################################')
else:
pixel_values = processor(textline_image, return_tensors="pt").pixel_values
generated_ids = model_ocr.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#print(generated_text,'generated_text')
#print('########################################')
return generated_text
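The removed method runs the TrOCR processor and model over a textline, batching the two halves of a wide line into one generate call and re-joining the decoded texts. A minimal sketch of that transformers pipeline, using the public microsoft/trocr-base-printed checkpoint and a hypothetical textline.png as stand-ins for eynollah's own model and cropped line images:

```python
from PIL import Image
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(device)

line = Image.open("textline.png").convert("RGB")   # any cropped textline image

# split a wide line in the middle and decode both halves in one batch,
# mirroring the two-image batch used by the removed code
left = line.crop((0, 0, line.width // 2, line.height))
right = line.crop((line.width // 2, 0, line.width, line.height))

pixel_values = processor([left, right], return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values.to(device))
texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(texts[0] + " " + texts[1])
```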
def return_ocr_of_textline(
self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot):
if h2w_ratio > 0.05:
pixel_values = processor(textline_image, return_tensors="pt").pixel_values
generated_ids = model_ocr.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
else:
#width = np.shape(textline_image)[1]
#height = np.shape(textline_image)[0]
#common_window = int(0.3*width)
#width1 = int ( width/2. - common_window )
#width2 = int ( width/2. + common_window )
try:
width1, width2 = self.return_start_and_end_of_common_text_of_textline_ocr_new(textline_image, ind_tot)
image1 = textline_image[:, :width2,:]# image.crop((0, 0, width2, height))
image2 = textline_image[:, width1:,:]#image.crop((width1, 0, width, height))
pixel_values1 = processor(image1, return_tensors="pt").pixel_values
pixel_values2 = processor(image2, return_tensors="pt").pixel_values
generated_ids1 = model_ocr.generate(pixel_values1.to(device))
generated_ids2 = model_ocr.generate(pixel_values2.to(device))
generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0]
generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0]
#print(generated_text1,'generated_text1')
#print(generated_text2, 'generated_text2')
#print('########################################')
match = sq(None, generated_text1, generated_text2).find_longest_match(
0, len(generated_text1), 0, len(generated_text2))
generated_text = generated_text1 + generated_text2[match.b+match.size:]
except:
pixel_values = processor(textline_image, return_tensors="pt").pixel_values
generated_ids = model_ocr.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
return generated_text
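The removed return_ocr_of_textline stitches the two half-line transcriptions back together by finding their longest common block with difflib's SequenceMatcher (imported as sq in this module) and appending only the non-overlapping tail of the second half. The merging step in isolation:

```python
from difflib import SequenceMatcher

def merge_overlapping(left: str, right: str) -> str:
    """Concatenate two transcriptions that overlap in the middle,
    keeping the overlapping region only once."""
    match = SequenceMatcher(None, left, right).find_longest_match(
        0, len(left), 0, len(right))
    # everything in `right` up to the end of the shared block is already
    # contained in `left`, so append only what follows it
    return left + right[match.b + match.size:]

print(merge_overlapping("the quick brown fo", "brown fox jumps"))
# -> "the quick brown fox jumps"
```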
def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes):
return list(np.array(ls_cons)[np.array(sorted_indexes)])
@@ -4009,8 +3794,6 @@ class Eynollah:
enabled_modes.append("Light textline detection")
if self.full_layout:
enabled_modes.append("Full layout analysis")
if self.ocr:
enabled_modes.append("OCR")
if self.tables:
enabled_modes.append("Table detection")
if enabled_modes:
@@ -4130,21 +3913,12 @@ class Eynollah:
id_of_texts_tot =['region_0001']
conf_contours_textregions =[0]
if self.ocr and not self.tr:
gc.collect()
ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons, np.zeros((len(all_found_textline_polygons), 4)),
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), textline_light=True)
else:
ocr_all_textlines = None
pcgts = self.writer.build_pagexml_no_full_layout(
cont_page, page_coord, order_text_new, id_of_texts_tot,
all_found_textline_polygons, page_coord, [],
[], [], [], [], [], [],
slopes, [], [],
cont_page, [], [],
ocr_all_textlines=ocr_all_textlines,
conf_contours_textregion=conf_contours_textregions,
skip_layout_reading_order=True)
self.logger.info("Basic processing complete")
@@ -4629,94 +4403,6 @@ class Eynollah:
boxes_d, textline_mask_tot_d)
self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s")
ocr_all_textlines = None
ocr_all_textlines_marginals_left = None
ocr_all_textlines_marginals_right = None
ocr_all_textlines_h = None
ocr_all_textlines_drop = None
if self.ocr:
self.logger.info("Step 4.5/5: OCR Processing")
if not self.tr:
gc.collect()
if len(all_found_textline_polygons):
ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons, all_box_coord,
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
if len(all_found_textline_polygons_marginals_left):
ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons_marginals_left, all_box_coord_marginals_left,
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
if len(all_found_textline_polygons_marginals_right):
ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons_marginals_right, all_box_coord_marginals_right,
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
if self.full_layout and len(all_found_textline_polygons):
ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines(
image_page, all_found_textline_polygons_h, all_box_coord_h,
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
if self.full_layout and len(polygons_of_drop_capitals):
ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines(
image_page, polygons_of_drop_capitals, np.zeros((len(polygons_of_drop_capitals), 4)),
self.model_zoo.get("ocr"), self.b_s_ocr, self.model_zoo.get("num_to_char"), self.textline_light, self.curved_line)
else:
if self.light_version:
self.logger.info("Using light version OCR")
if self.textline_light:
self.logger.info("Using light text line detection for OCR")
self.logger.info("Processing text lines...")
gc.collect()
torch.cuda.empty_cache()
self.model_zoo.get("ocr").to(self.device)
ind_tot = 0
#cv2.imwrite('./img_out.png', image_page)
ocr_all_textlines = []
# FIXME: what about lines in marginals / headings / drop-capitals here?
for indexing, ind_poly_first in enumerate(all_found_textline_polygons):
ocr_textline_in_textregion = []
for indexing2, ind_poly in enumerate(ind_poly_first):
if not (self.textline_light or self.curved_line):
ind_poly = copy.deepcopy(ind_poly)
box_ind = all_box_coord[indexing]
#print(ind_poly,np.shape(ind_poly), 'ind_poly')
#print(box_ind)
ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind)
#print(ind_poly_copy)
ind_poly[ind_poly<0] = 0
x, y, w, h = cv2.boundingRect(ind_poly)
#print(ind_poly_copy, np.shape(ind_poly_copy))
#print(x, y, w, h, h/float(w),'ratio')
h2w_ratio = h/float(w)
mask_poly = np.zeros(image_page.shape)
if not self.light_version:
img_poly_on_img = np.copy(image_page)
else:
img_poly_on_img = np.copy(img_bin_light)
mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1))
if self.textline_light:
mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1)
img_poly_on_img[:,:,0][mask_poly[:,:,0] ==0] = 255
img_poly_on_img[:,:,1][mask_poly[:,:,0] ==0] = 255
img_poly_on_img[:,:,2][mask_poly[:,:,0] ==0] = 255
img_croped = img_poly_on_img[y:y+h, x:x+w, :]
#cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped)
text_ocr = self.return_ocr_of_textline_without_common_section(
img_croped, self.model_zoo.get("ocr"), self.model_zoo.get("trocr_processor"), self.device, w, h2w_ratio, ind_tot)
ocr_textline_in_textregion.append(text_ocr)
ind_tot = ind_tot +1
ocr_all_textlines.append(ocr_textline_in_textregion)
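The removed loop isolates each textline before OCR by filling its contour into a mask, optionally dilating it, whitening everything outside the polygon, and cropping to the bounding rectangle. A compact OpenCV/NumPy sketch of that cropping step on a synthetic page and polygon:

```python
import cv2
import numpy as np

page = (np.random.rand(200, 300, 3) * 255).astype(np.uint8)              # stand-in for image_page
poly = np.array([[40, 50], [220, 60], [210, 120], [50, 110]], np.int32)  # one textline contour

# rasterise the contour into a mask and grow it slightly (stands in for KERNEL dilation)
mask = np.zeros(page.shape[:2], dtype=np.uint8)
cv2.fillPoly(mask, pts=[poly], color=1)
mask = cv2.dilate(mask, np.ones((3, 3), np.uint8), iterations=1)

# whiten everything outside the polygon, then crop to its bounding rectangle
line_on_white = page.copy()
line_on_white[mask == 0] = 255
x, y, w, h = cv2.boundingRect(poly)
cropped = line_on_white[y:y + h, x:x + w]
print(cropped.shape)   # this crop is what would be handed to the OCR model
```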
self.logger.info("Step 5/5: Output Generation")
if self.full_layout:
@@ -4728,9 +4414,7 @@ class Eynollah:
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right,
all_box_coord_marginals_left, all_box_coord_marginals_right,
slopes, slopes_h, slopes_marginals_left, slopes_marginals_right,
cont_page, polygons_seplines, ocr_all_textlines, ocr_all_textlines_h,
ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right,
ocr_all_textlines_drop,
cont_page, polygons_seplines,
conf_contours_textregions, conf_contours_textregions_h)
else:
pcgts = self.writer.build_pagexml_no_full_layout(
@@ -4741,9 +4425,6 @@ class Eynollah:
all_box_coord_marginals_left, all_box_coord_marginals_right,
slopes, slopes_marginals_left, slopes_marginals_right,
cont_page, polygons_seplines, contours_tables,
ocr_all_textlines=ocr_all_textlines,
ocr_all_textlines_marginals_left=ocr_all_textlines_marginals_left,
ocr_all_textlines_marginals_right=ocr_all_textlines_marginals_right,
conf_contours_textregions=conf_contours_textregions)
return pcgts

View file

@@ -473,8 +473,7 @@ class Eynollah_ocr:
img = cv2.imread(dir_img)
if dir_in_bin is not None:
cropped_lines_bin = []
dir_img_bin = os.path.join(dir_in_bin, file_name+'.png')
img_bin = cv2.imread(dir_img_bin)
img_bin = cv2.imread(os.path.join(dir_in_bin, file_name+'.png'))
if dir_out_image_text:
out_image_with_text = os.path.join(dir_out_image_text, file_name+'.png')

View file

@@ -13,12 +13,6 @@ from ocrd_models.constants import NAMESPACES as NS
"--textline_light", "--light_version"],
# -ep ...
# -eoi ...
# FIXME: find out whether OCR extra was installed, otherwise skip these
["--do_ocr"],
["--do_ocr", "--light_version", "--textline_light"],
["--do_ocr", "--transformer_ocr"],
#["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
# --skip_layout_and_reading_order
], ids=str)
def test_run_eynollah_layout_filename(