Mirror of https://github.com/qurator-spk/page2tsv.git
Synced 2025-10-30 16:14:13 +01:00

Commit 93ee53c8e2 (parent 568e1cd104)

    cli: split page2tsv from page2tsv_cli

2 changed files with 120 additions and 43 deletions
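The commit strips the Click decorators from page2tsv and reattaches them to a new thin wrapper, page2tsv_cli, which simply forwards its arguments; page2tsv itself becomes a plain function that other Python code can import and call directly. A minimal sketch of such a call, assuming the CLI module is tsvtools/cli.py (its name is not visible in this excerpt) and using placeholder file names:

    # Hedged sketch -- module path, file names and option values are assumptions, not part of the commit.
    from tsvtools.cli import page2tsv

    page2tsv('PAGE0001.xml', 'out.tsv', purpose='NERD', image_url='http://empty',
             ner_rest_endpoint=None, ned_rest_endpoint=None, noproxy=True,
             scale_factor=1.0, ned_threshold=None, min_confidence=None,
             max_confidence=None, ned_priority=1)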
				
			
@@ -13,7 +13,7 @@ from lxml import etree as ET
 from ocrd_models.ocrd_page import parse
 from ocrd_utils import bbox_from_points
 
-from qurator.utils.tsv import read_tsv, write_tsv, extract_doc_links
+from .tsv import read_tsv, write_tsv, extract_doc_links
 from .ocr import get_conf_color
 
 @click.command()
@@ -54,27 +54,6 @@ def annotate_tsv(tsv_file, annotated_tsv_file):
     df.to_csv(annotated_tsv_file, sep="\t", quoting=3, index=False)
 
 
-@click.command()
-@click.argument('page-xml-file', type=click.Path(exists=True), required=True, nargs=1)
-@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1)
-@click.option('--purpose', type=click.Choice(['NERD', 'OCR'], case_sensitive=False), default="NERD",
-              help="Purpose of output tsv file. "
-                   "\n\nNERD: NER/NED application/ground-truth creation. "
-                   "\n\nOCR: OCR application/ground-truth creation. "
-                   "\n\ndefault: NERD.")
-@click.option('--image-url', type=str, default='http://empty')
-@click.option('--ner-rest-endpoint', type=str, default=None,
-              help="REST endpoint of sbb_ner service. See https://github.com/qurator-spk/sbb_ner for details. "
-                   "Only applicable in case of NERD.")
-@click.option('--ned-rest-endpoint', type=str, default=None,
-              help="REST endpoint of sbb_ned service. See https://github.com/qurator-spk/sbb_ned for details. "
-                   "Only applicable in case of NERD.")
-@click.option('--noproxy', type=bool, is_flag=True, help='disable proxy. default: enabled.')
-@click.option('--scale-factor', type=float, default=1.0, help='default: 1.0')
-@click.option('--ned-threshold', type=float, default=None)
-@click.option('--min-confidence', type=float, default=None)
-@click.option('--max-confidence', type=float, default=None)
-@click.option('--ned-priority', type=int, default=1)
 def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
              noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
     if purpose == "NERD":
@@ -102,7 +81,6 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
 
     for region_idx, region in enumerate(pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')):
         for text_line in region.get_TextLine():
-
             left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]
 
             if min_confidence is not None and max_confidence is not None:
@@ -118,19 +96,15 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
                 for text_equiv in text_line.get_TextEquiv():
                     # transform OCR coordinates using `scale_factor` to derive
                     # correct coordinates for the web presentation image
-                    left, top, right, bottom = [int(scale_factor * x) for x in
-                                                bbox_from_points(text_line.get_Coords().points)]
-
+                    left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]
                     tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                 text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
             else:
                 for word in words:
-
                     for text_equiv in word.get_TextEquiv():
                         # transform OCR coordinates using `scale_factor` to derive
                         # correct coordinates for the web presentation image
                         left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]
-
                         tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                     text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
 
@@ -142,51 +116,41 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
     tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] +
                                     ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])
 
+    # print(tsv)
+    with open(tsv_out_file, 'a') as f:
+        f.write('# ' + image_url + '\n')
+
     if len(tsv) == 0:
         return
 
-    with open(tsv_out_file, 'a') as f:
-
-        f.write('# ' + image_url + '\n')
-
     vlinecenter = pd.DataFrame(tsv[['line', 'top']].groupby('line', sort=False).mean().top +
                                (tsv[['line', 'bottom']].groupby('line', sort=False).mean().bottom -
                                 tsv[['line', 'top']].groupby('line', sort=False).mean().top) / 2,
                                columns=['vlinecenter'])
 
     tsv = tsv.merge(vlinecenter, left_on='line', right_index=True)
-
     regions = [region.sort_values(['vlinecenter', 'hcenter']) for rid, region in tsv.groupby('rid', sort=False)]
-
     tsv = pd.concat(regions)
 
     if purpose == 'NERD':
-
         tsv['No.'] = 0
         tsv['NE-TAG'] = 'O'
         tsv['NE-EMB'] = 'O'
         tsv['ID'] = '-'
         tsv['conf'] = '-'
-
         tsv = tsv.rename(columns={'TEXT': 'TOKEN'})
-    elif purpose == 'OCR':
 
+    elif purpose == 'OCR':
         tsv = pd.DataFrame([(line, " ".join(part.TEXT.to_list())) for line, part in tsv.groupby('line')],
                            columns=['line', 'TEXT'])
-
         tsv = tsv.merge(line_info, left_on='line', right_index=True)
-
     tsv = tsv[out_columns].reset_index(drop=True)
 
     try:
         if purpose == 'NERD' and ner_rest_endpoint is not None:
-
             tsv, ner_result = ner(tsv, ner_rest_endpoint)
-
             if ned_rest_endpoint is not None:
-
                 tsv, _ = ned(tsv, ner_result, ned_rest_endpoint, threshold=ned_threshold, priority=ned_priority)
-
         tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
     except requests.HTTPError as e:
         print(e)
@@ -250,3 +214,29 @@ def make_page2tsv_commands(xls_file, directory, purpose):
                       '{}-{:08d}/left,top,width,height/full/0/default.jpg --scale-factor=1.0 --purpose={}'.
                       format(file, ma.group(1), ma.group(2), int(ma.group(3)), purpose))
 
+
+@click.command()
+@click.argument('page-xml-file', type=click.Path(exists=True), required=True, nargs=1)
+@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1)
+@click.option('--purpose', type=click.Choice(['NERD', 'OCR'], case_sensitive=False), default="NERD",
+              help="Purpose of output tsv file. "
+                   "\n\nNERD: NER/NED application/ground-truth creation. "
+                   "\n\nOCR: OCR application/ground-truth creation. "
+                   "\n\ndefault: NERD.")
+@click.option('--image-url', type=str, default='http://empty')
+@click.option('--ner-rest-endpoint', type=str, default=None,
+              help="REST endpoint of sbb_ner service. See https://github.com/qurator-spk/sbb_ner for details. "
+                   "Only applicable in case of NERD.")
+@click.option('--ned-rest-endpoint', type=str, default=None,
+              help="REST endpoint of sbb_ned service. See https://github.com/qurator-spk/sbb_ned for details. "
+                   "Only applicable in case of NERD.")
+@click.option('--noproxy', type=bool, is_flag=True, help='disable proxy. default: enabled.')
+@click.option('--scale-factor', type=float, default=1.0, help='default: 1.0')
+@click.option('--ned-threshold', type=float, default=None)
+@click.option('--min-confidence', type=float, default=None)
+@click.option('--max-confidence', type=float, default=None)
+@click.option('--ned-priority', type=int, default=1)
+def page2tsv_cli(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
+             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
+    return page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
+             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority)
tsvtools/tsv.py  (new file, 87 lines)
@@ -0,0 +1,87 @@
+import pandas as pd
+import re
+
+
+def read_tsv(tsv_file):
+
+    tsv = pd.read_csv(tsv_file, sep='\t', comment='#', quoting=3).rename(columns={'GND-ID': 'ID'})
+
+    parts = extract_doc_links(tsv_file)
+
+    urls = [part['url'] for part in parts]
+
+    return tsv, urls
+
+
+def write_tsv(tsv, urls, tsv_out_file):
+
+    if 'conf' in tsv.columns:
+        out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
+    else:
+        out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom']
+
+    if len(urls) == 0:
+        print('Writing to {}...'.format(tsv_out_file))
+
+        tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False)
+    else:
+        pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)
+
+        for url_id, part in tsv.groupby('url_id'):
+            with open(tsv_out_file, 'a') as f:
+                f.write('# ' + urls[int(url_id)] + '\n')
+
+            part.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
+
+
+def extract_doc_links(tsv_file):
+    parts = []
+
+    header = None
+
+    with open(tsv_file, 'r') as f:
+
+        text = []
+        url = None
+
+        for line in f:
+
+            if header is None:
+                header = "\t".join(line.split()) + '\n'
+                continue
+
+            urls = [url for url in
+                    re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)]
+
+            if len(urls) > 0:
+                if url is not None:
+                    parts.append({"url": url, 'header': header, 'text': "".join(text)})
+                    text = []
+
+                url = urls[-1]
+            else:
+                if url is None:
+                    continue
+
+                line = '\t'.join(line.split())
+
+                if line.count('\t') == 2:
+                    line = "\t" + line
+
+                if line.count('\t') >= 3:
+                    text.append(line + '\n')
+
+                    continue
+
+                if line.startswith('#'):
+                    continue
+
+                if len(line) == 0:
+                    continue
+
+                print('Line error: |', line, '|Number of Tabs: ', line.count('\t'))
+
+        if url is not None:
+            parts.append({"url": url, 'header': header, 'text': "".join(text)})
+
+    return parts
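The new tsvtools/tsv.py module handles the '# <url>' comment lines that page2tsv writes into its output: extract_doc_links collects each URL together with the column header and the rows below it, read_tsv returns the parsed table (comment lines skipped, GND-ID renamed to ID) plus the list of URLs, and write_tsv writes a table back, emitting one '# <url>' line before each url_id group. A small round-trip sketch, assuming tsvtools is importable as a package and using placeholder file names:

    # Hedged usage sketch -- file names are placeholders, not part of the commit.
    from tsvtools.tsv import read_tsv, write_tsv

    tsv, urls = read_tsv('annotated.tsv')       # DataFrame plus the '# http...' URLs found in the file
    write_tsv(tsv, urls, 'annotated_out.tsv')   # one '# <url>' comment line per url_id group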