mirror of
				https://github.com/qurator-spk/eynollah.git
				synced 2025-10-27 15:54:13 +01:00 
			
		
		
		
	Should the merged text for the whole page be written to the XML?
This commit is contained in:
		
							parent
							
								
									920705c3b1
								
							
						
					
					
						commit
						d968a306e4
					
				
					 1 changed file with 28 additions and 3 deletions
				
			
		|  | @ -5129,7 +5129,7 @@ class Eynollah_ocr: | ||||||
|                     self.b_s = int(batch_size) |                     self.b_s = int(batch_size) | ||||||
| 
 | 
 | ||||||
|             else: |             else: | ||||||
|                 self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"# |                 self.model_ocr_dir = dir_models + "/model_ens_ocrcnn_new6"#"/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"# | ||||||
|                 model_ocr = load_model(self.model_ocr_dir , compile=False) |                 model_ocr = load_model(self.model_ocr_dir , compile=False) | ||||||
|                  |                  | ||||||
|                 self.prediction_model = tf.keras.models.Model( |                 self.prediction_model = tf.keras.models.Model( | ||||||
|  | @ -5141,7 +5141,7 @@ class Eynollah_ocr: | ||||||
|                     self.b_s = int(batch_size) |                     self.b_s = int(batch_size) | ||||||
| 
 | 
 | ||||||
|                      |                      | ||||||
|                 with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: |                 with open(os.path.join(self.model_ocr_dir, "characters_20250707_all_lang.txt"),"r") as config_file: | ||||||
|                     characters = json.load(config_file) |                     characters = json.load(config_file) | ||||||
|                      |                      | ||||||
|                 AUTOTUNE = tf.data.AUTOTUNE |                 AUTOTUNE = tf.data.AUTOTUNE | ||||||
|  | @ -5780,9 +5780,24 @@ class Eynollah_ocr: | ||||||
|                             text_by_textregion.append(" ".join(extracted_texts_merged_un)) |                             text_by_textregion.append(" ".join(extracted_texts_merged_un)) | ||||||
|                         #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion') |                         #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion') | ||||||
|                          |                          | ||||||
|  |                          | ||||||
|  |                     ###index_tot_regions = [] | ||||||
|  |                     ###tot_region_ref = [] | ||||||
|  | 
 | ||||||
|  |                     ###for jj in root1.iter(link+'RegionRefIndexed'): | ||||||
|  |                         ###index_tot_regions.append(jj.attrib['index']) | ||||||
|  |                         ###tot_region_ref.append(jj.attrib['regionRef']) | ||||||
|  |                          | ||||||
|  |                     ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)} | ||||||
|  |          | ||||||
|  |                     id_textregions = [] | ||||||
|  |                     textregions_by_existing_ids = [] | ||||||
|                     indexer = 0 |                     indexer = 0 | ||||||
|                     indexer_textregion = 0 |                     indexer_textregion = 0 | ||||||
|                     for nn in root1.iter(region_tags): |                     for nn in root1.iter(region_tags): | ||||||
|  |                         id_textregion = nn.attrib['id'] | ||||||
|  |                         id_textregions.append(id_textregion) | ||||||
|  |                         textregions_by_existing_ids.append(text_by_textregion[indexer_textregion]) | ||||||
|                          |                          | ||||||
|                         is_textregion_text = False |                         is_textregion_text = False | ||||||
|                         for childtest in nn: |                         for childtest in nn: | ||||||
|  | @ -5830,6 +5845,16 @@ class Eynollah_ocr: | ||||||
|                                 unicode_textregion.text = text_by_textregion[indexer_textregion] |                                 unicode_textregion.text = text_by_textregion[indexer_textregion] | ||||||
|                             indexer_textregion = indexer_textregion + 1 |                             indexer_textregion = indexer_textregion + 1 | ||||||
|                              |                              | ||||||
|  |                     ###sample_order  = [(id_to_order[tid], text) for tid, text in zip(id_textregions, textregions_by_existing_ids) if tid in id_to_order] | ||||||
|  |                      | ||||||
|  |                     ##ordered_texts_sample = [text for _, text in sorted(sample_order)] | ||||||
|  |                     ##tot_page_text = ' '.join(ordered_texts_sample) | ||||||
|  |                      | ||||||
|  |                     ##for page_element in root1.iter(link+'Page'): | ||||||
|  |                         ##text_page = ET.SubElement(page_element, 'TextEquiv') | ||||||
|  |                         ##unicode_textpage = ET.SubElement(text_page, 'Unicode') | ||||||
|  |                         ##unicode_textpage.text = tot_page_text | ||||||
|  |                      | ||||||
|                     ET.register_namespace("",name_space) |                     ET.register_namespace("",name_space) | ||||||
|                     tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) |                     tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) | ||||||
|                     #print("Job done in %.1fs", time.time() - t0) |                     #print("Job done in %.1fs", time.time() - t0) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue