When running OCR on PAGE-XML files that already contain text, the newly predicted text replaces the existing text.

2025-07-01 15:09:54 +02:00 · 2025-03-26 18:42:06 +01:00 · 2025-03-26 18:42:06 +01:00 · 7df0427b04
commit 7df0427b04
parent 370d44a66b
1 changed file with 36 additions and 7 deletions
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@@ -4965,7 +4965,7 @@ class Eynollah_ocr:
            self.model_ocr.to(self.device)

        else:
-            self.model_ocr_dir = dir_models + "/model_step_100000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn"
+            self.model_ocr_dir = dir_models + "/model_step_150000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn"
            model_ocr = load_model(self.model_ocr_dir , compile=False)
            
            self.prediction_model = tf.keras.models.Model(
@@ -5358,6 +5358,13 @@ class Eynollah_ocr:
                    indexer = 0
                    indexer_textregion = 0
                    for nn in root1.iter(region_tags):
+                        
+                        is_textregion_text = False
+                        for childtest in nn:
+                            if childtest.tag.endswith("TextEquiv"):
+                                is_textregion_text = True
+                        
+                        if not is_textregion_text:
                            text_subelement_textregion = ET.SubElement(nn, 'TextEquiv')
                            unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode')

@@ -5365,12 +5372,34 @@ class Eynollah_ocr:
                        has_textline = False
                        for child_textregion in nn:
                            if child_textregion.tag.endswith("TextLine"):
+                                
+                                is_textline_text = False
+                                for childtest2 in child_textregion:
+                                    if childtest2.tag.endswith("TextEquiv"):
+                                        is_textline_text = True
+                                
+                                
+                                if not is_textline_text:
                                    text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
                                    unicode_textline = ET.SubElement(text_subelement, 'Unicode')
                                    unicode_textline.text = extracted_texts_merged[indexer]
+                                else:
+                                    for childtest3 in child_textregion:
+                                        if childtest3.tag.endswith("TextEquiv"):
+                                            for child_uc in childtest3:
+                                                if child_uc.tag.endswith("Unicode"):
+                                                    child_uc.text = extracted_texts_merged[indexer]
+                                        
                                indexer = indexer + 1
                                has_textline = True
                        if has_textline:
+                            if is_textregion_text:
+                                for child4 in nn:
+                                    if child4.tag.endswith("TextEquiv"):
+                                        for childtr_uc in child4:
+                                            if childtr_uc.tag.endswith("Unicode"):
+                                                childtr_uc.text = text_by_textregion[indexer_textregion]
+                            else:
                                unicode_textregion.text = text_by_textregion[indexer_textregion]
                            indexer_textregion = indexer_textregion + 1