mirror of
https://github.com/qurator-spk/eynollah.git
synced 2026-05-26 07:39:22 +02:00
trocr: avoid warnings by passing clean_up_tokenization_spaces=False
This commit is contained in:
parent
f9f9130dbb
commit
d50bd7c650
1 changed file with 33 additions and 17 deletions
|
|
@ -139,11 +139,14 @@ class Eynollah_ocr(Eynollah):
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(
|
||||||
|
imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
pixel_values_merged.to(self.device))
|
pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged,
|
||||||
|
skip_special_tokens=True,
|
||||||
|
clean_up_tokenization_spaces=False)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
||||||
|
|
@ -162,11 +165,14 @@ class Eynollah_ocr(Eynollah):
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(
|
||||||
|
imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
pixel_values_merged.to(self.device))
|
pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged,
|
||||||
|
skip_special_tokens=True,
|
||||||
|
clean_up_tokenization_spaces=False)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
||||||
|
|
@ -182,11 +188,14 @@ class Eynollah_ocr(Eynollah):
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(
|
||||||
|
imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
pixel_values_merged.to(self.device))
|
pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged,
|
||||||
|
skip_special_tokens=True,
|
||||||
|
clean_up_tokenization_spaces=False)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
||||||
|
|
@ -194,22 +203,23 @@ class Eynollah_ocr(Eynollah):
|
||||||
cropped_lines.append(img_crop)
|
cropped_lines.append(img_crop)
|
||||||
cropped_lines_meging_indexing.append(0)
|
cropped_lines_meging_indexing.append(0)
|
||||||
indexer_b_s+=1
|
indexer_b_s+=1
|
||||||
|
|
||||||
if indexer_b_s==self.b_s:
|
if indexer_b_s==self.b_s:
|
||||||
imgs = cropped_lines[:]
|
imgs = cropped_lines[:]
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(
|
||||||
|
imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
pixel_values_merged.to(self.device))
|
pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged,
|
||||||
|
skip_special_tokens=True,
|
||||||
|
clean_up_tokenization_spaces=False)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
indexer_text_region = indexer_text_region +1
|
indexer_text_region = indexer_text_region +1
|
||||||
|
|
||||||
if indexer_b_s!=0:
|
if indexer_b_s!=0:
|
||||||
|
|
@ -217,9 +227,14 @@ class Eynollah_ocr(Eynollah):
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(pixel_values_merged.to(self.device))
|
imgs, return_tensors="pt").pixel_values
|
||||||
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
|
pixel_values_merged.to(self.device))
|
||||||
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
|
generated_ids_merged,
|
||||||
|
skip_special_tokens=True,
|
||||||
|
clean_up_tokenization_spaces=False)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
||||||
|
|
@ -750,6 +765,7 @@ class Eynollah_ocr(Eynollah):
|
||||||
indexer_textregion = indexer_textregion + 1
|
indexer_textregion = indexer_textregion + 1
|
||||||
|
|
||||||
ET.register_namespace("",page_ns)
|
ET.register_namespace("",page_ns)
|
||||||
|
self.logger.info("output filename: '%s'", out_file_ocr)
|
||||||
page_tree.write(out_file_ocr, xml_declaration=True, method='xml', encoding="utf-8", default_namespace=None)
|
page_tree.write(out_file_ocr, xml_declaration=True, method='xml', encoding="utf-8", default_namespace=None)
|
||||||
|
|
||||||
def run(
|
def run(
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue