mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-12-01 08:44:13 +01:00
move line-gt extraction out of ocr to eynollah-training
This commit is contained in:
parent
951bd2fce6
commit
30f9c695dc
4 changed files with 500 additions and 429 deletions
|
|
@ -59,12 +59,6 @@ import click
|
||||||
is_flag=True,
|
is_flag=True,
|
||||||
help="if this parameter set to true, transformer ocr will be applied, otherwise cnn_rnn model.",
|
help="if this parameter set to true, transformer ocr will be applied, otherwise cnn_rnn model.",
|
||||||
)
|
)
|
||||||
@click.option(
|
|
||||||
"--export_textline_images_and_text",
|
|
||||||
"-etit/-noetit",
|
|
||||||
is_flag=True,
|
|
||||||
help="if this parameter set to true, images and text in xml will be exported into output dir. This files can be used for training a OCR engine.",
|
|
||||||
)
|
|
||||||
@click.option(
|
@click.option(
|
||||||
"--do_not_mask_with_textline_contour",
|
"--do_not_mask_with_textline_contour",
|
||||||
"-nmtc/-mtc",
|
"-nmtc/-mtc",
|
||||||
|
|
@ -76,11 +70,6 @@ import click
|
||||||
"-bs",
|
"-bs",
|
||||||
help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively",
|
help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively",
|
||||||
)
|
)
|
||||||
@click.option(
|
|
||||||
"--dataset_abbrevation",
|
|
||||||
"-ds_pref",
|
|
||||||
help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset",
|
|
||||||
)
|
|
||||||
@click.option(
|
@click.option(
|
||||||
"--min_conf_value_of_textline_text",
|
"--min_conf_value_of_textline_text",
|
||||||
"-min_conf",
|
"-min_conf",
|
||||||
|
|
@ -97,7 +86,6 @@ def ocr_cli(
|
||||||
dir_out_image_text,
|
dir_out_image_text,
|
||||||
overwrite,
|
overwrite,
|
||||||
tr_ocr,
|
tr_ocr,
|
||||||
export_textline_images_and_text,
|
|
||||||
do_not_mask_with_textline_contour,
|
do_not_mask_with_textline_contour,
|
||||||
batch_size,
|
batch_size,
|
||||||
dataset_abbrevation,
|
dataset_abbrevation,
|
||||||
|
|
@ -106,18 +94,11 @@ def ocr_cli(
|
||||||
"""
|
"""
|
||||||
Recognize text with a CNN/RNN or transformer ML model.
|
Recognize text with a CNN/RNN or transformer ML model.
|
||||||
"""
|
"""
|
||||||
assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr"
|
assert bool(image) ^ bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
|
||||||
# FIXME: refactor: move export_textline_images_and_text out of eynollah.py
|
|
||||||
# assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m"
|
|
||||||
assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs"
|
|
||||||
assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib"
|
|
||||||
assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit"
|
|
||||||
assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
|
|
||||||
from ..eynollah_ocr import Eynollah_ocr
|
from ..eynollah_ocr import Eynollah_ocr
|
||||||
eynollah_ocr = Eynollah_ocr(
|
eynollah_ocr = Eynollah_ocr(
|
||||||
model_zoo=ctx.obj.model_zoo,
|
model_zoo=ctx.obj.model_zoo,
|
||||||
tr_ocr=tr_ocr,
|
tr_ocr=tr_ocr,
|
||||||
export_textline_images_and_text=export_textline_images_and_text,
|
|
||||||
do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
|
do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
pref_of_dataset=dataset_abbrevation,
|
pref_of_dataset=dataset_abbrevation,
|
||||||
|
|
|
||||||
|
|
@ -9,17 +9,13 @@ from logging import Logger, getLogger
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os
|
import os
|
||||||
import json
|
|
||||||
import gc
|
import gc
|
||||||
import sys
|
import sys
|
||||||
import math
|
import math
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from keras.layers import StringLookup
|
|
||||||
import cv2
|
import cv2
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
import tensorflow as tf
|
|
||||||
from keras.models import load_model
|
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from eynollah.model_zoo import EynollahModelZoo
|
from eynollah.model_zoo import EynollahModelZoo
|
||||||
|
|
@ -48,11 +44,6 @@ if sys.version_info < (3, 10):
|
||||||
else:
|
else:
|
||||||
import importlib.resources as importlib_resources
|
import importlib.resources as importlib_resources
|
||||||
|
|
||||||
try:
|
|
||||||
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
|
||||||
except ImportError:
|
|
||||||
TrOCRProcessor = VisionEncoderDecoderModel = None
|
|
||||||
|
|
||||||
class Eynollah_ocr:
|
class Eynollah_ocr:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|
@ -60,27 +51,16 @@ class Eynollah_ocr:
|
||||||
model_zoo: EynollahModelZoo,
|
model_zoo: EynollahModelZoo,
|
||||||
tr_ocr=False,
|
tr_ocr=False,
|
||||||
batch_size: Optional[int]=None,
|
batch_size: Optional[int]=None,
|
||||||
export_textline_images_and_text: bool=False,
|
|
||||||
do_not_mask_with_textline_contour: bool=False,
|
do_not_mask_with_textline_contour: bool=False,
|
||||||
pref_of_dataset=None,
|
|
||||||
min_conf_value_of_textline_text : Optional[float]=None,
|
min_conf_value_of_textline_text : Optional[float]=None,
|
||||||
logger: Optional[Logger]=None,
|
logger: Optional[Logger]=None,
|
||||||
):
|
):
|
||||||
self.tr_ocr = tr_ocr
|
self.tr_ocr = tr_ocr
|
||||||
# For generating textline-image pairs for traning, move to generate_gt_for_training
|
|
||||||
self.export_textline_images_and_text = export_textline_images_and_text
|
|
||||||
# masking for OCR and GT generation, relevant for skewed lines and bounding boxes
|
# masking for OCR and GT generation, relevant for skewed lines and bounding boxes
|
||||||
self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
|
self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
|
||||||
# prefix or dataset
|
|
||||||
self.pref_of_dataset = pref_of_dataset
|
|
||||||
self.logger = logger if logger else getLogger('eynollah.ocr')
|
self.logger = logger if logger else getLogger('eynollah.ocr')
|
||||||
self.model_zoo = model_zoo
|
self.model_zoo = model_zoo
|
||||||
|
|
||||||
# TODO: Properly document what 'export_textline_images_and_text' is about
|
|
||||||
if export_textline_images_and_text:
|
|
||||||
self.logger.info("export_textline_images_and_text was set, so no actual models are loaded")
|
|
||||||
return
|
|
||||||
|
|
||||||
self.min_conf_value_of_textline_text = min_conf_value_of_textline_text if min_conf_value_of_textline_text else 0.3
|
self.min_conf_value_of_textline_text = min_conf_value_of_textline_text if min_conf_value_of_textline_text else 0.3
|
||||||
self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size
|
self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size
|
||||||
|
|
||||||
|
|
@ -539,11 +519,6 @@ class Eynollah_ocr:
|
||||||
mask_poly = mask_poly[y:y+h, x:x+w, :]
|
mask_poly = mask_poly[y:y+h, x:x+w, :]
|
||||||
img_crop = img_poly_on_img[y:y+h, x:x+w, :]
|
img_crop = img_poly_on_img[y:y+h, x:x+w, :]
|
||||||
|
|
||||||
if self.export_textline_images_and_text:
|
|
||||||
if not self.do_not_mask_with_textline_contour:
|
|
||||||
img_crop[mask_poly==0] = 255
|
|
||||||
|
|
||||||
else:
|
|
||||||
# print(file_name, angle_degrees, w*h,
|
# print(file_name, angle_degrees, w*h,
|
||||||
# mask_poly[:,:,0].sum(),
|
# mask_poly[:,:,0].sum(),
|
||||||
# mask_poly[:,:,0].sum() /float(w*h) ,
|
# mask_poly[:,:,0].sum() /float(w*h) ,
|
||||||
|
|
@ -602,7 +577,6 @@ class Eynollah_ocr:
|
||||||
break_curved_line_into_small_pieces_and_then_merge(
|
break_curved_line_into_small_pieces_and_then_merge(
|
||||||
img_crop, mask_poly)
|
img_crop, mask_poly)
|
||||||
|
|
||||||
if not self.export_textline_images_and_text:
|
|
||||||
if w_scaled < 750:#1.5*image_width:
|
if w_scaled < 750:#1.5*image_width:
|
||||||
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
|
img_fin = preprocess_and_resize_image_for_ocrcnn_model(
|
||||||
img_crop, image_height, image_width)
|
img_crop, image_height, image_width)
|
||||||
|
|
@ -666,31 +640,9 @@ class Eynollah_ocr:
|
||||||
img_crop_bin, image_height, image_width)
|
img_crop_bin, image_height, image_width)
|
||||||
cropped_lines_bin.append(img_fin)
|
cropped_lines_bin.append(img_fin)
|
||||||
|
|
||||||
if self.export_textline_images_and_text:
|
|
||||||
if img_crop.shape[0]==0 or img_crop.shape[1]==0:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
if child_textlines.tag.endswith("TextEquiv"):
|
|
||||||
for cheild_text in child_textlines:
|
|
||||||
if cheild_text.tag.endswith("Unicode"):
|
|
||||||
textline_text = cheild_text.text
|
|
||||||
if textline_text:
|
|
||||||
base_name = os.path.join(
|
|
||||||
dir_out, file_name + '_line_' + str(indexer_textlines))
|
|
||||||
if self.pref_of_dataset:
|
|
||||||
base_name += '_' + self.pref_of_dataset
|
|
||||||
if not self.do_not_mask_with_textline_contour:
|
|
||||||
base_name += '_masked'
|
|
||||||
|
|
||||||
with open(base_name + '.txt', 'w') as text_file:
|
|
||||||
text_file.write(textline_text)
|
|
||||||
cv2.imwrite(base_name + '.png', img_crop)
|
|
||||||
indexer_textlines+=1
|
|
||||||
|
|
||||||
if not self.export_textline_images_and_text:
|
|
||||||
indexer_text_region = indexer_text_region +1
|
indexer_text_region = indexer_text_region +1
|
||||||
|
|
||||||
if not self.export_textline_images_and_text:
|
|
||||||
extracted_texts = []
|
extracted_texts = []
|
||||||
extracted_conf_value = []
|
extracted_conf_value = []
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ from .build_model_load_pretrained_weights_and_save import build_model_load_pretr
|
||||||
from .generate_gt_for_training import main as generate_gt_cli
|
from .generate_gt_for_training import main as generate_gt_cli
|
||||||
from .inference import main as inference_cli
|
from .inference import main as inference_cli
|
||||||
from .train import ex
|
from .train import ex
|
||||||
|
from .extract_line_gt import linegt_cli
|
||||||
|
|
||||||
@click.command(context_settings=dict(
|
@click.command(context_settings=dict(
|
||||||
ignore_unknown_options=True,
|
ignore_unknown_options=True,
|
||||||
|
|
@ -24,3 +25,4 @@ main.add_command(build_model_load_pretrained_weights_and_save)
|
||||||
main.add_command(generate_gt_cli, 'generate-gt')
|
main.add_command(generate_gt_cli, 'generate-gt')
|
||||||
main.add_command(inference_cli, 'inference')
|
main.add_command(inference_cli, 'inference')
|
||||||
main.add_command(train_cli, 'train')
|
main.add_command(train_cli, 'train')
|
||||||
|
main.add_command(linegt_cli, 'export_textline_images_and_text')
|
||||||
|
|
|
||||||
136
src/eynollah/training/extract_line_gt.py
Normal file
136
src/eynollah/training/extract_line_gt.py
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
|
from logging import Logger, getLogger
|
||||||
|
from typing import Optional
|
||||||
|
from pathlib import Path
|
||||||
|
import os
|
||||||
|
|
||||||
|
import click
|
||||||
|
import cv2
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ..utils import is_image_filename
|
||||||
|
|
||||||
|
def _parse_points(points):
    """Parse a ``points`` attribute ("x1,y1 x2,y2 ...") into an (N, 2) int32 array.

    Returns ``None`` when the attribute holds no points at all.
    """
    pairs = [point.split(',') for point in points.split()]
    if not pairs:
        return None
    return np.array([[int(pair[0]), int(pair[1])] for pair in pairs], dtype=np.int32)


def _crop_textline(img, textline_coords, mask_with_contour):
    """Cut the bounding box of a text line polygon out of ``img``.

    Only the crop is copied (not the whole page), and the polygon mask is
    rasterized at crop size, so the cost per line is proportional to the
    line, not to the page.

    Args:
        img: page image as returned by ``cv2.imread`` (height x width x channels).
        textline_coords: (N, 2) int32 array of polygon points in page coordinates.
        mask_with_contour: if true, every pixel outside the polygon is set to 255.

    Returns:
        The cropped (and optionally masked) image, or ``None`` if the bounding
        box does not intersect the image.
    """
    x, y, w, h = cv2.boundingRect(textline_coords)
    height, width = img.shape[:2]
    # Clamp to the image: negative starts would otherwise be interpreted by
    # numpy as "count from the end" and produce a wrong or empty slice.
    x0, y0 = max(x, 0), max(y, 0)
    x1, y1 = min(x + w, width), min(y + h, height)
    if x1 <= x0 or y1 <= y0:
        return None

    img_crop = img[y0:y1, x0:x1].copy()
    if mask_with_contour:
        mask_poly = np.zeros(img_crop.shape[:2], dtype=np.uint8)
        # Shift the polygon into crop coordinates before filling.
        shifted = textline_coords - np.array([x0, y0], dtype=np.int32)
        cv2.fillPoly(mask_poly, pts=[shifted], color=1)
        img_crop[mask_poly == 0] = 255
    return img_crop


def _export_page_lines(dir_img, dir_xmls, dir_out, pref_of_dataset, mask_with_contour):
    """Write one ``.png`` / ``.txt`` pair per transcribed text line of a page.

    The XML file is looked up in ``dir_xmls`` under the image's filename stem
    plus ``.xml``. Output files are named
    ``<stem>_line_<index>[_<pref_of_dataset>][_masked]`` inside ``dir_out``.

    Raises:
        click.ClickException: if the image cannot be decoded.
    """
    file_name = Path(dir_img).stem
    dir_xml = os.path.join(dir_xmls, file_name + '.xml')

    img = cv2.imread(dir_img)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise click.ClickException("Cannot read image file '%s'" % dir_img)

    root1 = ET.parse(dir_xml, parser=ET.XMLParser(encoding="utf-8")).getroot()
    # The first tag ending in 'TextRegion' carries the namespace-qualified name.
    region_tag = next((elem.tag for elem in root1.iter() if elem.tag.endswith('TextRegion')), None)
    if region_tag is None:
        # Page without text regions: nothing to export.
        return

    suffix = ''
    if pref_of_dataset:
        suffix += '_' + pref_of_dataset
    if mask_with_contour:
        suffix += '_masked'

    indexer_textlines = 0
    # FIXME: non recursive, use OCR-D PAGE generateDS API. Or use an existing tool for this purpose altogether
    for region in root1.iter(region_tag):
        for child_textregion in region:
            if not child_textregion.tag.endswith("TextLine"):
                continue
            # Reset per line so that a line without usable coordinates can
            # never be written with the crop of a previous line.
            img_crop = None
            for child_textlines in child_textregion:
                if child_textlines.tag.endswith("Coords"):
                    textline_coords = _parse_points(child_textlines.attrib['points'])
                    img_crop = None
                    if textline_coords is not None:
                        img_crop = _crop_textline(img, textline_coords, mask_with_contour)
                elif child_textlines.tag.endswith("TextEquiv"):
                    for child_text in child_textlines:
                        if not child_text.tag.endswith("Unicode"):
                            continue
                        textline_text = child_text.text
                        if textline_text and img_crop is not None:
                            base_name = os.path.join(
                                dir_out, file_name + '_line_' + str(indexer_textlines) + suffix
                            )
                            # Transcriptions are Unicode; do not depend on the locale codec.
                            with open(base_name + '.txt', 'w', encoding='utf-8') as text_file:
                                text_file.write(textline_text)
                            cv2.imwrite(base_name + '.png', img_crop)
                        # The index advances for every Unicode element seen,
                        # whether or not a pair was written for it.
                        indexer_textlines += 1


@click.command()
@click.option(
    "--image",
    "-i",
    # Explicit parameter name: without it click would pass this option as
    # ``image``, which the function below does not accept.
    'image_filename',
    help="input image filename",
    type=click.Path(exists=True, dir_okay=False),
)
@click.option(
    "--dir_in",
    "-di",
    help="directory of input images (instead of --image)",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "--dir_xmls",
    "-dx",
    help="directory of input PAGE-XML files (in addition to --dir_in; filename stems must match the image files, with '.xml' suffix).",
    type=click.Path(exists=True, file_okay=False),
    required=True,
)
@click.option(
    "--out",
    "-o",
    'dir_out',
    help="directory for output textline images (.png) and their texts (.txt)",
    type=click.Path(exists=True, file_okay=False),
    required=True,
)
@click.option(
    "--dataset_abbrevation",
    "-ds_pref",
    'pref_of_dataset',
    help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset",
)
@click.option(
    "--do_not_mask_with_textline_contour",
    "-nmtc/-mtc",
    is_flag=True,
    help="if this parameter set to true, cropped textline images will not be masked with textline contour.",
)
def linegt_cli(
    image_filename,
    dir_in,
    dir_xmls,
    dir_out,
    pref_of_dataset,
    do_not_mask_with_textline_contour,
):
    """Export cropped text line images and their transcriptions from PAGE-XML.

    For every input image, the XML file with the same filename stem is read
    from --dir_xmls, and each text line that has both coordinates and a
    non-empty text is written to --out as a .png / .txt pair.
    """
    # Exactly one of the two input modes must be chosen. Raise a usage error
    # rather than assert, so the check also holds under ``python -O``.
    if bool(dir_in) == bool(image_filename):
        raise click.UsageError("Either --image or --dir_in must be provided, but not both.")

    if dir_in:
        ls_imgs = [
            os.path.join(dir_in, name)
            for name in sorted(filter(is_image_filename, os.listdir(dir_in)))
        ]
    else:
        ls_imgs = [image_filename]

    for dir_img in ls_imgs:
        _export_page_lines(
            dir_img,
            dir_xmls,
            dir_out,
            pref_of_dataset,
            mask_with_contour=not do_not_mask_with_textline_contour,
        )
|
||||||
Loading…
Add table
Add a link
Reference in a new issue