ocr engine first integration

pull/138/head^2
vahidrezanezhad 5 months ago
parent eac18c553d
commit 5144668834

@@ -139,6 +139,12 @@ from qurator.eynollah.eynollah import Eynollah
is_flag=True,
help="if this parameter set to true, this tool would apply machine based reading order detection",
)
@click.option(
"--do_ocr",
"-ocr/-noocr",
is_flag=True,
help="if this parameter set to true, this tool will try to do ocr",
)
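With this flag in place, OCR becomes opt-in from the command line. A hypothetical invocation (the -i/-o/-m option names are assumed from the existing entry point):

    eynollah -i page.tif -o output_dir -m models_dir -ocr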
@click.option(
"--log-level",
"-l",
@@ -167,6 +173,7 @@ def main(
headers_off,
light_version,
reading_order_machine_based,
do_ocr,
ignore_page_extraction,
log_level
):
@@ -205,6 +212,7 @@ def main(
light_version=light_version,
ignore_page_extraction=ignore_page_extraction,
reading_order_machine_based=reading_order_machine_based,
do_ocr=do_ocr,
)
eynollah.run()
#pcgts = eynollah.run()

@@ -17,6 +17,16 @@ import gc
from ocrd_utils import getLogger
import cv2
import numpy as np
from transformers import TrOCRProcessor
from PIL import Image
import torch
from difflib import SequenceMatcher as sq
from transformers import VisionEncoderDecoderModel
from numba import cuda
import copy
from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter1d
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
stderr = sys.stderr
sys.stderr = open(os.devnull, "w")
@@ -166,6 +176,7 @@ class Eynollah:
light_version=False,
ignore_page_extraction=False,
reading_order_machine_based=False,
do_ocr=False,
override_dpi=None,
logger=None,
pcgts=None,
@@ -199,6 +210,7 @@ class Eynollah:
self.headers_off = headers_off
self.light_version = light_version
self.ignore_page_extraction = ignore_page_extraction
self.ocr = do_ocr
self.pcgts = pcgts
if not dir_in:
self.plotter = None if not enable_plotting else EynollahPlotter(
@@ -233,6 +245,9 @@ class Eynollah:
self.model_textline_dir = dir_models + "/eynollah-textline_light_20210425"
else:
self.model_textline_dir = dir_models + "/eynollah-textline_20210425"
if self.ocr:
self.model_ocr_dir = dir_models + "/checkpoint-166692_printed_trocr"
self.model_tables = dir_models + "/eynollah-tables_20210319"
self.models = {}
@@ -251,6 +266,10 @@ class Eynollah:
self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np)
self.model_region_fl = self.our_load_model(self.model_region_dir_fully)
self.model_reading_order_machine = self.our_load_model(self.model_reading_order_machine_dir)
if self.ocr:
self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir)
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")  # alternative: "microsoft/trocr-base-printed"
self.ls_imgs = os.listdir(self.dir_in)
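For orientation, this is the TrOCR inference pattern used throughout this change, as a minimal standalone sketch; the checkpoint path and input image are placeholders, and the processor name matches the one loaded above:

    from PIL import Image
    import torch
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel

    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    model = VisionEncoderDecoderModel.from_pretrained("path/to/fine_tuned_checkpoint")  # placeholder
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    image = Image.open("textline.png").convert("RGB")  # placeholder crop of one textline
    pixel_values = processor(image, return_tensors="pt").pixel_values   # resize + normalize
    generated_ids = model.generate(pixel_values.to(device))             # autoregressive decode
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(text)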
@@ -3135,6 +3154,223 @@ class Eynollah:
return order_of_texts, id_of_texts
def return_start_and_end_of_common_text_of_textline_ocr(self, textline_image, ind_tot):
width = np.shape(textline_image)[1]
height = np.shape(textline_image)[0]
common_window = int(0.2 * width)
width1 = int(width / 2. - common_window)
width2 = int(width / 2. + common_window)
img_sum = np.sum(textline_image[:, :, 0], axis=0)
sum_smoothed = gaussian_filter1d(img_sum, 3)
peaks_real, _ = find_peaks(sum_smoothed, height=0)
if len(peaks_real) > 70:
#print(len(peaks_real), 'len(peaks_real)')
peaks_real = peaks_real[(peaks_real < width2) & (peaks_real > width1)]
arg_sort = np.argsort(sum_smoothed[peaks_real])
arg_sort4 = arg_sort[::-1][:4]
peaks_sort_4 = peaks_real[arg_sort][::-1][:4]
argsort_sorted = np.argsort(peaks_sort_4)
first_4_sorted = peaks_sort_4[argsort_sorted]
y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]]
#print(first_4_sorted,'first_4_sorted')
arg_sortnew = np.argsort(y_4_sorted)
peaks_final = np.sort(first_4_sorted[arg_sortnew][2:])
#plt.figure(ind_tot)
#plt.imshow(textline_image)
#plt.plot([peaks_final[0], peaks_final[0]], [0, height-1])
#plt.plot([peaks_final[1], peaks_final[1]], [0, height-1])
#plt.savefig('./'+str(ind_tot)+'.png')
return peaks_final[0], peaks_final[1]
else:
return None
def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image, ind_tot):
width = np.shape(textline_image)[1]
height = np.shape(textline_image)[0]
common_window = int(0.06 * width)
width1 = int(width / 2. - common_window)
width2 = int(width / 2. + common_window)
img_sum = np.sum(textline_image[:, :, 0], axis=0)
sum_smoothed = gaussian_filter1d(img_sum, 3)
peaks_real, _ = find_peaks(sum_smoothed, height=0)
if len(peaks_real) > 70:
#print(len(peaks_real), 'len(peaks_real)')
peaks_real = peaks_real[(peaks_real < width2) & (peaks_real > width1)]
arg_max = np.argmax(sum_smoothed[peaks_real])
peaks_final = peaks_real[arg_max]
#plt.figure(ind_tot)
#plt.imshow(textline_image)
#plt.plot([peaks_final, peaks_final], [0, height-1])
##plt.plot([peaks_final[1], peaks_final[1]], [0, height-1])
#plt.savefig('./'+str(ind_tot)+'.png')
return peaks_final
else:
return None
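The two functions above share one primitive: a Gaussian-smoothed column-intensity profile whose peaks mark candidate split columns near the line's midpoint. A minimal synthetic demo of that primitive (the >70-peak guard from above is not reproduced):

    import numpy as np
    from scipy.signal import find_peaks
    from scipy.ndimage import gaussian_filter1d

    # Synthetic "textline": dark background with a bright column cluster
    # near the midpoint, standing in for inter-word whitespace.
    img = np.full((32, 400, 3), 30, dtype=np.uint8)
    img[:, 195:205, :] = 250

    profile = np.sum(img[:, :, 0], axis=0)       # column-wise intensity sum
    smoothed = gaussian_filter1d(profile, 3)     # suppress pixel noise
    peaks, _ = find_peaks(smoothed, height=0)    # candidate split columns

    width = img.shape[1]
    w1 = int(width / 2. - 0.06 * width)
    w2 = int(width / 2. + 0.06 * width)
    central = peaks[(peaks > w1) & (peaks < w2)]  # keep peaks near the center
    split = central[np.argmax(smoothed[central])] if len(central) else None
    print(split)  # a column index near 200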
def return_start_and_end_of_common_text_of_textline_ocr_new_splitted(self, peaks_real, sum_smoothed, start_split, end_split):
peaks_real = peaks_real[(peaks_real < end_split) & (peaks_real > start_split)]
arg_sort = np.argsort(sum_smoothed[peaks_real])
arg_sort4 = arg_sort[::-1][:4]
peaks_sort_4 = peaks_real[arg_sort][::-1][:4]
argsort_sorted = np.argsort(peaks_sort_4)
first_4_sorted = peaks_sort_4[argsort_sorted]
y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]]
#print(first_4_sorted,'first_4_sorted')
arg_sortnew = np.argsort(y_4_sorted)
peaks_final = np.sort(first_4_sorted[arg_sortnew][3:])
return peaks_final[0]
def return_start_and_end_of_common_text_of_textline_ocr_new(self, textline_image, ind_tot):
width = np.shape(textline_image)[1]
height = np.shape(textline_image)[0]
common_window = int(0.15 * width)
width1 = int(width / 2. - common_window)
width2 = int(width / 2. + common_window)
mid = int(width / 2.)
img_sum = np.sum(textline_image[:, :, 0], axis=0)
sum_smoothed = gaussian_filter1d(img_sum, 3)
peaks_real, _ = find_peaks(sum_smoothed, height=0)
if len(peaks_real) > 70:
peak_start = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted(peaks_real, sum_smoothed, width1, mid + 2)
peak_end = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted(peaks_real, sum_smoothed, mid - 2, width2)
#plt.figure(ind_tot)
#plt.imshow(textline_image)
#plt.plot([peak_start, peak_start], [0, height-1])
#plt.plot([peak_end, peak_end], [0, height-1])
#plt.savefig('./'+str(ind_tot)+'.png')
return peak_start, peak_end
else:
return None
def return_ocr_of_textline_without_common_section(self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio, ind_tot):
if h2w_ratio > 0.05:
pixel_values = processor(textline_image, return_tensors="pt").pixel_values
generated_ids = model_ocr.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
else:
#width = np.shape(textline_image)[1]
#height = np.shape(textline_image)[0]
#common_window = int(0.3*width)
#width1 = int ( width/2. - common_window )
#width2 = int ( width/2. + common_window )
split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image, ind_tot)
if split_point:
image1 = textline_image[:, :split_point, :]  # image.crop((0, 0, width2, height))
image2 = textline_image[:, split_point:, :]  # image.crop((width1, 0, width, height))
#pixel_values1 = processor(image1, return_tensors="pt").pixel_values
#pixel_values2 = processor(image2, return_tensors="pt").pixel_values
pixel_values_merged = processor([image1, image2], return_tensors="pt").pixel_values
generated_ids_merged = model_ocr.generate(pixel_values_merged.to(device))
generated_text_merged = processor.batch_decode(generated_ids_merged, skip_special_tokens=True)
#print(generated_text_merged,'generated_text_merged')
#generated_ids1 = model_ocr.generate(pixel_values1.to(device))
#generated_ids2 = model_ocr.generate(pixel_values2.to(device))
#generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0]
#generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0]
#generated_text = generated_text1 + ' ' + generated_text2
generated_text = generated_text_merged[0] + ' ' + generated_text_merged[1]
#print(generated_text1,'generated_text1')
#print(generated_text2, 'generated_text2')
#print('########################################')
else:
pixel_values = processor(textline_image, return_tensors="pt").pixel_values
generated_ids = model_ocr.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#print(generated_text,'generated_text')
#print('########################################')
return generated_text
def return_ocr_of_textline(self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio, ind_tot):
if h2w_ratio > 0.05:
pixel_values = processor(textline_image, return_tensors="pt").pixel_values
generated_ids = model_ocr.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
else:
#width = np.shape(textline_image)[1]
#height = np.shape(textline_image)[0]
#common_window = int(0.3*width)
#width1 = int ( width/2. - common_window )
#width2 = int ( width/2. + common_window )
try:
width1, width2 = self.return_start_and_end_of_common_text_of_textline_ocr_new(textline_image, ind_tot)
image1 = textline_image[:, :width2, :]  # image.crop((0, 0, width2, height))
image2 = textline_image[:, width1:, :]  # image.crop((width1, 0, width, height))
pixel_values1 = processor(image1, return_tensors="pt").pixel_values
pixel_values2 = processor(image2, return_tensors="pt").pixel_values
generated_ids1 = model_ocr.generate(pixel_values1.to(device))
generated_ids2 = model_ocr.generate(pixel_values2.to(device))
generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0]
generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0]
#print(generated_text1,'generated_text1')
#print(generated_text2, 'generated_text2')
#print('########################################')
match = sq(None, generated_text1, generated_text2).find_longest_match(0, len(generated_text1), 0, len(generated_text2))
generated_text = generated_text1 + generated_text2[match.b+match.size:]
except Exception:
pixel_values = processor(textline_image, return_tensors="pt").pixel_values
generated_ids = model_ocr.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
return generated_text
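The stitching step above uses difflib's longest common match to drop the doubled overlap between the two half-line transcriptions. A small worked example with illustrative strings:

    from difflib import SequenceMatcher as sq

    left = "the quick brown fox"
    right = "brown fox jumps over"
    match = sq(None, left, right).find_longest_match(0, len(left), 0, len(right))
    # match covers "brown fox" in both halves; keep it once and append the rest.
    merged = left + right[match.b + match.size:]
    print(merged)  # "the quick brown fox jumps over"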
def return_textline_contour_with_added_box_coordinate(self, textline_contour, box_ind):
textline_contour[:, 0] = textline_contour[:, 0] + box_ind[2]
textline_contour[:, 1] = textline_contour[:, 1] + box_ind[0]
return textline_contour
def run(self):
"""
@@ -3398,6 +3634,7 @@ class Eynollah:
if self.plotter:
self.plotter.write_images_into_directory(polygons_of_images, image_page)
t_order = time.time()
if self.full_layout:
if self.reading_order_machine_based:
@@ -3425,11 +3662,67 @@ class Eynollah:
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con])
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d)
if self.ocr:
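# Reset the GPU via numba to release memory held by the TF models before loading the torch OCR model.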
device = cuda.get_current_device()
device.reset()
gc.collect()
model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
torch.cuda.empty_cache()
model_ocr.to(device)
ind_tot = 0
#cv2.imwrite('./img_out.png', image_page)
ocr_all_textlines = []
for indexing, ind_poly_first in enumerate(all_found_textline_polygons):
ocr_textline_in_textregion = []
for indexing2, ind_poly in enumerate(ind_poly_first):
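# Light/curved textline contours are already in page coordinates; otherwise shift the contour by its region box first.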
if not (self.textline_light or self.curved_line):
ind_poly = copy.deepcopy(ind_poly)
box_ind = all_box_coord[indexing]
#print(ind_poly,np.shape(ind_poly), 'ind_poly')
#print(box_ind)
ind_poly = self.return_textline_contour_with_added_box_coordinate(ind_poly, box_ind)
#print(ind_poly_copy)
ind_poly[ind_poly<0] = 0
x, y, w, h = cv2.boundingRect(ind_poly)
#print(ind_poly_copy, np.shape(ind_poly_copy))
#print(x, y, w, h, h/float(w),'ratio')
h2w_ratio = h/float(w)
mask_poly = np.zeros(image_page.shape)
img_poly_on_img = np.copy(image_page)
mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1))
if self.textline_light:
mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1)
img_poly_on_img[:, :, 0][mask_poly[:, :, 0] == 0] = 255
img_poly_on_img[:, :, 1][mask_poly[:, :, 0] == 0] = 255
img_poly_on_img[:, :, 2][mask_poly[:, :, 0] == 0] = 255
img_croped = img_poly_on_img[y:y+h, x:x+w, :]
text_ocr = self.return_ocr_of_textline_without_common_section(img_croped, model_ocr, processor, device, w, h2w_ratio, ind_tot)
ocr_textline_in_textregion.append(text_ocr)
##cv2.imwrite(str(ind_tot)+'.png', img_croped)
ind_tot = ind_tot + 1
ocr_all_textlines.append(ocr_textline_in_textregion)
else:
ocr_all_textlines = None
#print(ocr_all_textlines)
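A condensed, hypothetical sketch of the per-textline preparation this loop performs: isolate one polygon on a white page, crop its bounding box, and hand the crop to TrOCR. Names and the standalone packaging are illustrative; the real loop also shifts contours by their region box and optionally dilates the mask:

    import cv2
    import numpy as np

    def crop_textline(image_page, polygon):
        """White out everything outside the polygon, then crop to its bounding box.

        polygon: int32 array of shape (N, 2) in page coordinates.
        Returns the cropped image and its height-to-width ratio.
        """
        polygon = polygon.copy()
        polygon[polygon < 0] = 0                   # clamp negative coordinates
        x, y, w, h = cv2.boundingRect(polygon)
        mask = np.zeros(image_page.shape[:2], dtype=np.uint8)
        cv2.fillPoly(mask, pts=[polygon], color=1)
        isolated = image_page.copy()
        isolated[mask == 0] = 255                  # blank out neighbouring lines
        return isolated[y:y + h, x:x + w], h / float(w)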
self.logger.info("detection of reading order took %.1fs", time.time() - t_order)
pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables)
pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
self.logger.info("Job done in %.1fs", time.time() - t0)
##return pcgts
self.writer.write_pagexml(pcgts)
#self.logger.info("Job done in %.1fs", time.time() - t0)
if self.dir_in:
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)

@@ -2,7 +2,7 @@
# pylint: disable=import-error
from pathlib import Path
import os.path
import xml.etree.ElementTree as ET
from .utils.xml import create_page_xml, xml_reading_order
from .utils.counter import EynollahIdCounter
@@ -12,6 +12,7 @@ from ocrd_models.ocrd_page import (
CoordsType,
PcGtsType,
TextLineType,
TextEquivType,
TextRegionType,
ImageRegionType,
TableRegionType,
@@ -93,11 +94,13 @@ class EynollahXmlWriter():
points_co += ' '
coords.set_points(points_co[:-1])
def serialize_lines_in_region(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter):
def serialize_lines_in_region(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion):
self.logger.debug('enter serialize_lines_in_region')
for j in range(len(all_found_textline_polygons[region_idx])):
coords = CoordsType()
textline = TextLineType(id=counter.next_line_id, Coords=coords)
if ocr_all_textlines_textregion:
textline.set_TextEquiv([TextEquivType(Unicode=ocr_all_textlines_textregion[j])])
text_region.add_TextLine(textline)
region_bboxes = all_box_coord[region_idx]
points_co = ''
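For context, attaching recognized text to a line follows the generateDS-based PAGE-XML pattern used by the writer; a minimal standalone sketch (id, points, and text are placeholders):

    from ocrd_models.ocrd_page import CoordsType, TextEquivType, TextLineType

    line = TextLineType(id="l_0001", Coords=CoordsType(points="0,0 100,0 100,40 0,40"))
    # Attach the recognized string as the line's TextEquiv.
    line.set_TextEquiv([TextEquivType(Unicode="recognized text goes here")])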
@@ -140,7 +143,7 @@ class EynollahXmlWriter():
with open(out_fname, 'w') as f:
f.write(to_xml(pcgts))
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables):
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines):
self.logger.debug('enter build_pagexml_no_full_layout')
# create the file structure
@@ -159,7 +162,11 @@
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)),
)
page.add_TextRegion(textregion)
self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter)
if ocr_all_textlines:
ocr_textlines = ocr_all_textlines[mm]
else:
ocr_textlines = None
self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter, ocr_textlines)
for mm in range(len(found_polygons_marginals)):
marginal = TextRegionType(id=counter.next_region_id, type_='marginalia',
