mirror of
				https://github.com/qurator-spk/neat.git
				synced 2025-10-31 00:34:14 +01:00 
			
		
		
		
	improve page2tsv tool
This commit is contained in:
		
							parent
							
								
									a206504560
								
							
						
					
					
						commit
						d6311edd0c
					
				
					 6 changed files with 89 additions and 28 deletions
				
			
		|  | @ -296,6 +296,8 @@ function setupInterface(data, file) { | ||||||
| 
 | 
 | ||||||
|     function updateTable() { |     function updateTable() { | ||||||
| 
 | 
 | ||||||
|  |         let do_not_display = new Set(['url_id', 'left', 'right', 'top', 'bottom']); | ||||||
|  | 
 | ||||||
|         editingTd = null; |         editingTd = null; | ||||||
| 
 | 
 | ||||||
|         let editable_html = |         let editable_html = | ||||||
|  | @ -318,7 +320,7 @@ function setupInterface(data, file) { | ||||||
|                   $.each(el, |                   $.each(el, | ||||||
|                       function(column, content) { |                       function(column, content) { | ||||||
| 
 | 
 | ||||||
|                           if (column == 'url_id') return |                           if (do_not_display.has(column)) return | ||||||
| 
 | 
 | ||||||
|                           var clickAction = function() { console.log('Do something different');} |                           var clickAction = function() { console.log('Do something different');} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -66,12 +66,12 @@ | ||||||
|                 <img id="preview" alt="facsimile_preview" class="img-responsive fit-image"/> |                 <img id="preview" alt="facsimile_preview" class="img-responsive fit-image"/> | ||||||
|             </a> |             </a> | ||||||
|         </div> |         </div> | ||||||
|         <div class="col-8 text-center" id="tableregion"> |         <div class="col-9 text-center" id="tableregion"> | ||||||
|             Please upload a TSV file in the <a href="https://sites.google.com/site/germeval2014ner/data" target="_blank">GermEval2014 data format</a>: |             Please upload a TSV file in the <a href="https://sites.google.com/site/germeval2014ner/data" target="_blank">GermEval2014 data format</a>: | ||||||
|             <br><br> |             <br><br> | ||||||
|             <input type="file" id="tsv-file" name="files"/> |             <input type="file" id="tsv-file" name="files"/> | ||||||
|         </div> |         </div> | ||||||
|         <div class="col-2" id="region-right"> |         <div class="col-1" id="region-right"> | ||||||
|         </div> |         </div> | ||||||
|     </div> |     </div> | ||||||
|     <div class="row  mt-3"> |     <div class="row  mt-3"> | ||||||
|  |  | ||||||
|  | @ -22,7 +22,40 @@ Install package together with its dependencies in development mode: | ||||||
| pip install -e ./ | pip install -e ./ | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| ## Usage: | ## PAGE-XML to TSV Transformation: | ||||||
|  | 
 | ||||||
|  | Create a TSV file from OCR in PAGE-XML format (with word segmentation): | ||||||
|  | 
 | ||||||
|  | ``` | ||||||
|  | page2tsv PAGE1.xml PAGE.tsv --image-url=http://link-to-corresponding-image-1 | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | To create one TSV file from multiple PAGE-XML files, call the tool successively | ||||||
|  | with the same TSV output file; each call appends its page to that file: | ||||||
|  | 
 | ||||||
|  | ``` | ||||||
|  | page2tsv PAGE1.xml PAGE.tsv --image-url=http://link-to-corresponding-image-1 | ||||||
|  | page2tsv PAGE2.xml PAGE.tsv --image-url=http://link-to-corresponding-image-2 | ||||||
|  | page2tsv PAGE3.xml PAGE.tsv --image-url=http://link-to-corresponding-image-3 | ||||||
|  | page2tsv PAGE4.xml PAGE.tsv --image-url=http://link-to-corresponding-image-4 | ||||||
|  | page2tsv PAGE5.xml PAGE.tsv --image-url=http://link-to-corresponding-image-5 | ||||||
|  | ... | ||||||
|  | ... | ||||||
|  | ... | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | A corresponding URL-mapping file can be obtained from: | ||||||
|  | 
 | ||||||
|  | ``` | ||||||
|  | extract-doc-links PAGE.tsv  PAGE-urls.tsv | ||||||
|  | ``` | ||||||
|  | By loading the annotated TSV as well as the URL-mapping file into | ||||||
|  | ner.edith, you will be able to jump directly to the original image | ||||||
|  | from which the full text has been extracted. | ||||||
|  | 
 | ||||||
|  | --- | ||||||
|  | 
 | ||||||
|  | ## Processing of already existing TSV files: | ||||||
| 
 | 
 | ||||||
| Create a URL-annotated TSV file from an existing TSV file: | Create a URL-annotated TSV file from an existing TSV file: | ||||||
| 
 | 
 | ||||||
|  | @ -39,12 +72,3 @@ By loading the annotated TSV as well as the url mapping file into | ||||||
| ner.edith, you will be able to jump directly to the original image | ner.edith, you will be able to jump directly to the original image | ||||||
| where the full text has been extracted from. | where the full text has been extracted from. | ||||||
| 
 | 
 | ||||||
| # PAGE-XML to TSV Transformation |  | ||||||
| 
 |  | ||||||
| ## Usage: |  | ||||||
| 
 |  | ||||||
| Create a TSV file from OCR in PAGE-XML format (with word segmentation): |  | ||||||
| 
 |  | ||||||
| ``` |  | ||||||
| python page2tsv.py PAGE.xml > PAGE.tsv |  | ||||||
| ``` |  | ||||||
|  |  | ||||||
							
								
								
									
										48
									
								
								tools/cli.py
									
										
									
									
									
								
							
							
						
						
									
										48
									
								
								tools/cli.py
									
										
									
									
									
								
							|  | @ -2,6 +2,8 @@ import re | ||||||
| import click | import click | ||||||
| import pandas as pd | import pandas as pd | ||||||
| from io import StringIO | from io import StringIO | ||||||
|  | import os | ||||||
|  | import xml.etree.ElementTree as ET | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @click.command() | @click.command() | ||||||
|  | @ -78,7 +80,7 @@ def extract_doc_links(tsv_file): | ||||||
| 
 | 
 | ||||||
|                     line = "\t" + line |                     line = "\t" + line | ||||||
| 
 | 
 | ||||||
|                 if line.count('\t') == 3: |                 if line.count('\t') >= 3: | ||||||
| 
 | 
 | ||||||
|                     text.append(line + '\n') |                     text.append(line + '\n') | ||||||
| 
 | 
 | ||||||
|  | @ -92,6 +94,50 @@ def extract_doc_links(tsv_file): | ||||||
| 
 | 
 | ||||||
|                 print('Line error: |', line, '|Number of Tabs: ', line.count('\t')) |                 print('Line error: |', line, '|Number of Tabs: ', line.count('\t')) | ||||||
| 
 | 
 | ||||||
|  |         if url is not None: | ||||||
|             parts.append({"url": url, 'header': header, 'text': "".join(text)}) |             parts.append({"url": url, 'header': header, 'text': "".join(text)}) | ||||||
| 
 | 
 | ||||||
|     return parts |     return parts | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @click.command() | ||||||
|  | @click.argument('page-xml-file', type=click.Path(exists=True), required=True, nargs=1) | ||||||
|  | @click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1) | ||||||
|  | @click.option('--image-url', type=str, default='') | ||||||
|  | def page2tsv(page_xml_file, tsv_out_file, image_url): | ||||||
|  | 
 | ||||||
|  |     tree = ET.parse(page_xml_file) | ||||||
|  |     xmlns = tree.getroot().tag.split('}')[0].strip('{') | ||||||
|  | 
 | ||||||
|  |     urls = [] | ||||||
|  |     if os.path.exists(tsv_out_file): | ||||||
|  |         parts = extract_doc_links(tsv_out_file) | ||||||
|  | 
 | ||||||
|  |         urls = [part['url'] for part in parts] | ||||||
|  |     else: | ||||||
|  |         pd.DataFrame([], columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', 'url_id', 'left', 'right', 'top', | ||||||
|  |                                   'bottom']). to_csv(tsv_out_file, sep="\t", quoting=3, index=False) | ||||||
|  | 
 | ||||||
|  |     tsv = [] | ||||||
|  |     for words in tree.findall('.//{%s}Word' % xmlns): | ||||||
|  |         for word in words.findall('.//{%s}Unicode' % xmlns): | ||||||
|  |             text = word.text | ||||||
|  |             for coords in words.findall('.//{%s}Coords' % xmlns): | ||||||
|  | 
 | ||||||
|  |                 points = [int(pos) for p in coords.attrib['points'].split(' ') for pos in p.split(',')] | ||||||
|  | 
 | ||||||
|  |                 left = points[0] | ||||||
|  |                 right = points[2] | ||||||
|  |                 top = points[1] | ||||||
|  |                 bottom = points[5] | ||||||
|  | 
 | ||||||
|  |                 tsv.append((0, text, 'O', 'O', '-', len(urls), left, right, top, bottom)) | ||||||
|  | 
 | ||||||
|  |     with open(tsv_out_file, 'a') as f: | ||||||
|  | 
 | ||||||
|  |         f.write('# ' + image_url + '\n') | ||||||
|  | 
 | ||||||
|  |     tsv = pd.DataFrame(tsv, columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', | ||||||
|  |                                      'url_id', 'left', 'right', 'top', 'bottom']) | ||||||
|  | 
 | ||||||
|  |     tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False) | ||||||
|  |  | ||||||
|  | @ -1,12 +0,0 @@ | ||||||
| import sys |  | ||||||
| import codecs |  | ||||||
| import xml.etree.ElementTree as ET |  | ||||||
| 
 |  | ||||||
| sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict') |  | ||||||
| tree = ET.parse(sys.argv[1]) |  | ||||||
| xmlns = tree.getroot().tag.split('}')[0].strip('{') |  | ||||||
| for words in tree.findall('.//{%s}Word' % xmlns): |  | ||||||
| 	for word in words.findall('.//{%s}Unicode' % xmlns): |  | ||||||
| 		text = word.text |  | ||||||
| 		for coords in words.findall('.//{%s}Coords' % xmlns): |  | ||||||
| 			sys.stdout.write('0\t'+text+'\tO\tO\t'+coords.attrib['points']+'\n') |  | ||||||
|  | @ -21,7 +21,8 @@ setup( | ||||||
|     entry_points={ |     entry_points={ | ||||||
|       'console_scripts': [ |       'console_scripts': [ | ||||||
|         "extract-doc-links=cli:extract_document_links", |         "extract-doc-links=cli:extract_document_links", | ||||||
|         "annotate-tsv=cli:annotate_tsv" |         "annotate-tsv=cli:annotate_tsv", | ||||||
|  |         "page2tsv=cli:page2tsv" | ||||||
|       ] |       ] | ||||||
|     }, |     }, | ||||||
|     python_requires='>=3.6.0', |     python_requires='>=3.6.0', | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue