mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-08-29 11:59:55 +02:00
Should the merged text for the whole page be written to the XML?
This commit is contained in:
parent
920705c3b1
commit
d968a306e4
1 changed file with 28 additions and 3 deletions
|
@ -5129,7 +5129,7 @@ class Eynollah_ocr:
|
||||||
self.b_s = int(batch_size)
|
self.b_s = int(batch_size)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"#
|
self.model_ocr_dir = dir_models + "/model_ens_ocrcnn_new6"#"/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"#
|
||||||
model_ocr = load_model(self.model_ocr_dir , compile=False)
|
model_ocr = load_model(self.model_ocr_dir , compile=False)
|
||||||
|
|
||||||
self.prediction_model = tf.keras.models.Model(
|
self.prediction_model = tf.keras.models.Model(
|
||||||
|
@ -5141,7 +5141,7 @@ class Eynollah_ocr:
|
||||||
self.b_s = int(batch_size)
|
self.b_s = int(batch_size)
|
||||||
|
|
||||||
|
|
||||||
with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file:
|
with open(os.path.join(self.model_ocr_dir, "characters_20250707_all_lang.txt"),"r") as config_file:
|
||||||
characters = json.load(config_file)
|
characters = json.load(config_file)
|
||||||
|
|
||||||
AUTOTUNE = tf.data.AUTOTUNE
|
AUTOTUNE = tf.data.AUTOTUNE
|
||||||
|
@ -5780,9 +5780,24 @@ class Eynollah_ocr:
|
||||||
text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
||||||
#print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
|
#print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
|
||||||
|
|
||||||
|
|
||||||
|
###index_tot_regions = []
|
||||||
|
###tot_region_ref = []
|
||||||
|
|
||||||
|
###for jj in root1.iter(link+'RegionRefIndexed'):
|
||||||
|
###index_tot_regions.append(jj.attrib['index'])
|
||||||
|
###tot_region_ref.append(jj.attrib['regionRef'])
|
||||||
|
|
||||||
|
###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)}
|
||||||
|
|
||||||
|
id_textregions = []
|
||||||
|
textregions_by_existing_ids = []
|
||||||
indexer = 0
|
indexer = 0
|
||||||
indexer_textregion = 0
|
indexer_textregion = 0
|
||||||
for nn in root1.iter(region_tags):
|
for nn in root1.iter(region_tags):
|
||||||
|
id_textregion = nn.attrib['id']
|
||||||
|
id_textregions.append(id_textregion)
|
||||||
|
textregions_by_existing_ids.append(text_by_textregion[indexer_textregion])
|
||||||
|
|
||||||
is_textregion_text = False
|
is_textregion_text = False
|
||||||
for childtest in nn:
|
for childtest in nn:
|
||||||
|
@ -5829,7 +5844,17 @@ class Eynollah_ocr:
|
||||||
else:
|
else:
|
||||||
unicode_textregion.text = text_by_textregion[indexer_textregion]
|
unicode_textregion.text = text_by_textregion[indexer_textregion]
|
||||||
indexer_textregion = indexer_textregion + 1
|
indexer_textregion = indexer_textregion + 1
|
||||||
|
|
||||||
|
###sample_order = [(id_to_order[tid], text) for tid, text in zip(id_textregions, textregions_by_existing_ids) if tid in id_to_order]
|
||||||
|
|
||||||
|
##ordered_texts_sample = [text for _, text in sorted(sample_order)]
|
||||||
|
##tot_page_text = ' '.join(ordered_texts_sample)
|
||||||
|
|
||||||
|
##for page_element in root1.iter(link+'Page'):
|
||||||
|
##text_page = ET.SubElement(page_element, 'TextEquiv')
|
||||||
|
##unicode_textpage = ET.SubElement(text_page, 'Unicode')
|
||||||
|
##unicode_textpage.text = tot_page_text
|
||||||
|
|
||||||
ET.register_namespace("",name_space)
|
ET.register_namespace("",name_space)
|
||||||
tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None)
|
tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None)
|
||||||
#print("Job done in %.1fs", time.time() - t0)
|
#print("Job done in %.1fs", time.time() - t0)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue