Mirror of https://github.com/qurator-spk/page2tsv.git
	Merge pull request #6 from kba/lineid-ocr-tsv
Retain line_id, tsv2page CLI to propagate results back to PAGE-XML
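For context, the intended round trip with the new command looks roughly like this (a sketch: the file names are hypothetical, and the tsv2page options are read off the click declarations in the diff below):

    # extract line text into a TSV; with the OCR purpose, each row now
    # also records the id of its TextLine in the new line_id column
    page2tsv page_0001.xml page_0001.tsv [options...]

    # ... correct the TEXT column of page_0001.tsv by hand ...

    # write the corrections back into the PAGE-XML
    tsv2page -o page_0001.corrected.xml page_0001.xml page_0001.tsv

By default tsv2page removes the <Word> children of each corrected TextLine, since their word-level text no longer matches the corrected line; pass -k/--keep-words to retain them.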
Commit abcdb67e9e: 2 changed files with 27 additions and 5 deletions

Changed files:

    setup.py (1 addition)
    tsvtools/cli.py (26 additions, 5 deletions)
setup.py

@@ -23,6 +23,7 @@ setup(
         "extract-doc-links=tsvtools.cli:extract_document_links",
         "annotate-tsv=tsvtools.cli:annotate_tsv",
         "page2tsv=tsvtools.cli:page2tsv",
+        "tsv2page=tsvtools.cli:tsv2page",
         "find-entities=tsvtools.cli:find_entities",
         "make-page2tsv-commands=tsvtools.cli:make_page2tsv_commands"
       ]
tsvtools/cli.py

@@ -3,11 +3,13 @@ import glob
 import re
 import os
 from io import StringIO
+from pathlib import Path
 
 import numpy as np
 import click
 import pandas as pd
 import requests
+from lxml import etree as ET
 
 from ocrd_models.ocrd_page import parse
 from ocrd_utils import bbox_from_points
@@ -81,7 +83,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
     if purpose == "NERD":
         out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
     elif purpose == "OCR":
-        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
+        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id']
         if min_confidence is not None and max_confidence is not None:
             out_columns += ['ocrconf']
     else:
@@ -110,7 +112,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
             else:
                 conf = np.nan
 
-            line_info.append((len(urls), left, right, top, bottom, conf))
+            line_info.append((len(urls), left, right, top, bottom, conf, text_line.id))
 
             for word in text_line.get_Word():
                 for text_equiv in word.get_TextEquiv():
@@ -119,14 +121,14 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
                     left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]
 
                     tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
-                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom))
+                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
 
-    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf'])
+    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])
 
     if min_confidence is not None and max_confidence is not None:
         line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence))
 
-    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] + ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom'])
+    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] + ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])
 
     if len(tsv) == 0:
         return
@@ -177,6 +179,25 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
     except requests.HTTPError as e:
         print(e)
 
+@click.command()
+@click.option('--output-filename', '-o', help="Output filename. If omitted, PAGE-XML filename with .corrected.xml extension")
+@click.option('--keep-words', '-k', is_flag=True, help="Keep (out-of-date) Words of TextLines")
+@click.argument('page-file')
+@click.argument('tsv-file')
+def tsv2page(output_filename, keep_words, page_file, tsv_file):
+    if not output_filename:
+        output_filename = Path(page_file).stem + '.corrected.xml'
+    ns = {'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}
+    tsv = pd.read_csv(tsv_file, sep='\t', comment='#', quoting=3)
+    tree = ET.parse(page_file)
+    for _, row in tsv.iterrows():
+        el_textline = tree.find(f'//pc:TextLine[@id="{row.line_id}"]', namespaces=ns)
+        el_textline.find('pc:TextEquiv/pc:Unicode', namespaces=ns).text = row.TEXT
+        if not keep_words:
+            for el_word in el_textline.findall('pc:Word', namespaces=ns):
+                el_textline.remove(el_word)
+    with open(output_filename, 'w', encoding='utf-8') as f:
+        f.write(ET.tostring(tree, pretty_print=True).decode('utf-8'))
 
 @click.command()
 @click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
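For illustration, a row of an OCR-purpose TSV now ends with the id of the TextLine it came from (columns are tab-separated in the real file; the values below are hypothetical):

    TEXT                 url_id  left  right  top  bottom  conf  line_id
    Berlin, den 3. Mai   0       102   845    210  260     0.97  tl_1

This retained line_id column is what lets tsv2page match each corrected row back to its <TextLine> element via the XPath lookup above.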