When running OCR on PAGE-XML files that already contain text, the newly predicted text replaces the existing text.

2025-12-15 23:54:13 +01:00 · 2025-03-26 18:42:06 +01:00 · 2025-03-26 18:42:06 +01:00 · 7df0427b04
commit 7df0427b04
parent 370d44a66b
1 changed file with 36 additions and 7 deletions
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@@ -4965,7 +4965,7 @@ class Eynollah_ocr:
            self.model_ocr.to(self.device)
        else:
-            self.model_ocr_dir = dir_models + "/model_step_100000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn"
+            self.model_ocr_dir = dir_models + "/model_step_150000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn"
            model_ocr = load_model(self.model_ocr_dir , compile=False)
            self.prediction_model = tf.keras.models.Model(
@@ -5358,6 +5358,13 @@ class Eynollah_ocr:
                    indexer = 0
                    indexer_textregion = 0
                    for nn in root1.iter(region_tags):
                        is_textregion_text = False
                        for childtest in nn:
                            if childtest.tag.endswith("TextEquiv"):
                                is_textregion_text = True
                        if not is_textregion_text:
                            text_subelement_textregion = ET.SubElement(nn, 'TextEquiv')
                            unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode')
@@ -5365,12 +5372,34 @@ class Eynollah_ocr:
                        has_textline = False
                        for child_textregion in nn:
                            if child_textregion.tag.endswith("TextLine"):
                                is_textline_text = False
                                for childtest2 in child_textregion:
                                    if childtest2.tag.endswith("TextEquiv"):
                                        is_textline_text = True
                                if not is_textline_text:
                                    text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
                                    unicode_textline = ET.SubElement(text_subelement, 'Unicode')
                                    unicode_textline.text = extracted_texts_merged[indexer]
                                else:
                                    for childtest3 in child_textregion:
                                        if childtest3.tag.endswith("TextEquiv"):
                                            for child_uc in childtest3:
                                                if child_uc.tag.endswith("Unicode"):
                                                    child_uc.text = extracted_texts_merged[indexer]
                                indexer = indexer + 1
                                has_textline = True
                        if has_textline:
                            if is_textregion_text:
                                for child4 in nn:
                                    if child4.tag.endswith("TextEquiv"):
                                        for childtr_uc in child4:
                                            if childtr_uc.tag.endswith("Unicode"):
                                                childtr_uc.text = text_by_textregion[indexer_textregion]
                            else:
                                unicode_textregion.text = text_by_textregion[indexer_textregion]
                            indexer_textregion = indexer_textregion + 1