trocr: avoid warnings by passing clean_up_tokenization_spaces=False

This commit is contained in:
Robert Sachunsky 2026-05-21 14:20:51 +02:00
parent f9f9130dbb
commit d50bd7c650

View file

@@ -139,11 +139,14 @@ class Eynollah_ocr(Eynollah):
cropped_lines = [] cropped_lines = []
indexer_b_s = 0 indexer_b_s = 0
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values pixel_values_merged = self.model_zoo.get('trocr_processor')(
imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_zoo.get('ocr').generate( generated_ids_merged = self.model_zoo.get('ocr').generate(
pixel_values_merged.to(self.device)) pixel_values_merged.to(self.device))
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
generated_ids_merged, skip_special_tokens=True) generated_ids_merged,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
extracted_texts = extracted_texts + generated_text_merged extracted_texts = extracted_texts + generated_text_merged
@@ -162,11 +165,14 @@ class Eynollah_ocr(Eynollah):
cropped_lines = [] cropped_lines = []
indexer_b_s = 0 indexer_b_s = 0
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values pixel_values_merged = self.model_zoo.get('trocr_processor')(
imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_zoo.get('ocr').generate( generated_ids_merged = self.model_zoo.get('ocr').generate(
pixel_values_merged.to(self.device)) pixel_values_merged.to(self.device))
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
generated_ids_merged, skip_special_tokens=True) generated_ids_merged,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
extracted_texts = extracted_texts + generated_text_merged extracted_texts = extracted_texts + generated_text_merged
@@ -182,11 +188,14 @@ class Eynollah_ocr(Eynollah):
cropped_lines = [] cropped_lines = []
indexer_b_s = 0 indexer_b_s = 0
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values pixel_values_merged = self.model_zoo.get('trocr_processor')(
imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_zoo.get('ocr').generate( generated_ids_merged = self.model_zoo.get('ocr').generate(
pixel_values_merged.to(self.device)) pixel_values_merged.to(self.device))
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
generated_ids_merged, skip_special_tokens=True) generated_ids_merged,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
extracted_texts = extracted_texts + generated_text_merged extracted_texts = extracted_texts + generated_text_merged
@@ -200,16 +209,17 @@ class Eynollah_ocr(Eynollah):
cropped_lines = [] cropped_lines = []
indexer_b_s = 0 indexer_b_s = 0
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values pixel_values_merged = self.model_zoo.get('trocr_processor')(
imgs, return_tensors="pt").pixel_values
generated_ids_merged = self.model_zoo.get('ocr').generate( generated_ids_merged = self.model_zoo.get('ocr').generate(
pixel_values_merged.to(self.device)) pixel_values_merged.to(self.device))
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
generated_ids_merged, skip_special_tokens=True) generated_ids_merged,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
extracted_texts = extracted_texts + generated_text_merged extracted_texts = extracted_texts + generated_text_merged
indexer_text_region = indexer_text_region +1 indexer_text_region = indexer_text_region +1
if indexer_b_s!=0: if indexer_b_s!=0:
@@ -217,9 +227,14 @@ class Eynollah_ocr(Eynollah):
cropped_lines = [] cropped_lines = []
indexer_b_s = 0 indexer_b_s = 0
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values pixel_values_merged = self.model_zoo.get('trocr_processor')(
generated_ids_merged = self.model_zoo.get('ocr').generate(pixel_values_merged.to(self.device)) imgs, return_tensors="pt").pixel_values
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(generated_ids_merged, skip_special_tokens=True) generated_ids_merged = self.model_zoo.get('ocr').generate(
pixel_values_merged.to(self.device))
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
generated_ids_merged,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
extracted_texts = extracted_texts + generated_text_merged extracted_texts = extracted_texts + generated_text_merged
@@ -750,6 +765,7 @@ class Eynollah_ocr(Eynollah):
indexer_textregion = indexer_textregion + 1 indexer_textregion = indexer_textregion + 1
ET.register_namespace("",page_ns) ET.register_namespace("",page_ns)
self.logger.info("output filename: '%s'", out_file_ocr)
page_tree.write(out_file_ocr, xml_declaration=True, method='xml', encoding="utf-8", default_namespace=None) page_tree.write(out_file_ocr, xml_declaration=True, method='xml', encoding="utf-8", default_namespace=None)
def run( def run(