improve page2tsv tool

pull/39/head
Kai Labusch 5 years ago
parent a206504560
commit d6311edd0c

@@ -296,6 +296,8 @@ function setupInterface(data, file) {
    function updateTable() {
        let do_not_display = new Set(['url_id', 'left', 'right', 'top', 'bottom']);
        editingTd = null;
        let editable_html =
@@ -318,7 +320,7 @@ function setupInterface(data, file) {
        $.each(el,
            function(column, content) {
                if (column == 'url_id') return
                if (do_not_display.has(column)) return
                var clickAction = function() { console.log('Do something different');}

@@ -66,12 +66,12 @@
<img id="preview" alt="facsimile_preview" class="img-responsive fit-image"/>
</a>
</div>
<div class="col-8 text-center" id="tableregion">
<div class="col-9 text-center" id="tableregion">
Please upload a TSV file in the <a href="https://sites.google.com/site/germeval2014ner/data" target="_blank">GermEval2014 data format</a>:
<br><br>
<input type="file" id="tsv-file" name="files"/>
</div>
<div class="col-2" id="region-right">
<div class="col-1" id="region-right">
</div>
</div>
<div class="row mt-3">

@@ -22,29 +22,53 @@ Install package together with its dependencies in development mode:
pip install -e ./
```
## Usage:
## PAGE-XML to TSV Transformation:
Create a URL-annotated TSV file from an existing TSV file:
Create a TSV file from OCR in PAGE-XML format (with word segmentation):
```
annotate-tsv enp_DE.tsv enp_DE-annotated.tsv
page2tsv PAGE1.xml PAGE.tsv --image-url=http://link-to-corresponding-image-1
```
Create a corresponding URL-mapping file:
In order to create a TSV file for multiple PAGE-XML files, just perform successive calls
of the tool with the same output TSV file:
```
extract-doc-links enp_DE.tsv enp_DE-urls.tsv
page2tsv PAGE1.xml PAGE.tsv --image-url=http://link-to-corresponding-image-1
page2tsv PAGE2.xml PAGE.tsv --image-url=http://link-to-corresponding-image-2
page2tsv PAGE3.xml PAGE.tsv --image-url=http://link-to-corresponding-image-3
page2tsv PAGE4.xml PAGE.tsv --image-url=http://link-to-corresponding-image-4
page2tsv PAGE5.xml PAGE.tsv --image-url=http://link-to-corresponding-image-5
...
...
...
```
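If a document has many pages, these successive calls can also be scripted. A minimal sketch in Python: the list of PAGE-XML files and image URLs is hypothetical, and `page2tsv` is assumed to be on the PATH after installation:
```
import subprocess

# Hypothetical pairing of PAGE-XML files with the URLs of their facsimile images.
pages = [
    ('PAGE1.xml', 'http://link-to-corresponding-image-1'),
    ('PAGE2.xml', 'http://link-to-corresponding-image-2'),
]

for xml_file, url in pages:
    # Each call appends the tokens of one page (preceded by a '# <image-url>' line) to PAGE.tsv.
    subprocess.run(['page2tsv', xml_file, 'PAGE.tsv', '--image-url=' + url], check=True)
```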
A corresponding URL-mapping file can be obtained by running:
```
extract-doc-links PAGE.tsv PAGE-urls.tsv
```
By loading the annotated TSV as well as the URL-mapping file into
ner.edith, you will be able to jump directly to the original image
from which the full text was extracted.
# PAGE-XML to TSV Transformation
---
## Usage:
## Processing of already existing TSV files:
Create a TSV file from OCR in PAGE-XML format (with word segmentation):
Create a URL-annotated TSV file from an existing TSV file:
```
python page2tsv.py PAGE.xml > PAGE.tsv
annotate-tsv enp_DE.tsv enp_DE-annotated.tsv
```
Create a corresponding URL-mapping file:
```
extract-doc-links enp_DE.tsv enp_DE-urls.tsv
```
By loading the annotated TSV as well as the URL-mapping file into
ner.edith, you will be able to jump directly to the original image
from which the full text was extracted.

@@ -2,6 +2,8 @@ import re
import click
import pandas as pd
from io import StringIO
import os
import xml.etree.ElementTree as ET
@click.command()
@@ -78,7 +80,7 @@ def extract_doc_links(tsv_file):
            line = "\t" + line
        if line.count('\t') == 3:
        if line.count('\t') >= 3:
            text.append(line + '\n')
@@ -92,6 +94,50 @@ def extract_doc_links(tsv_file):
            print('Line error: |', line, '|Number of Tabs: ', line.count('\t'))

    parts.append({"url": url, 'header': header, 'text': "".join(text)})
    if url is not None:
        parts.append({"url": url, 'header': header, 'text': "".join(text)})

    return parts
@click.command()
@click.argument('page-xml-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1)
@click.option('--image-url', type=str, default='')
def page2tsv(page_xml_file, tsv_out_file, image_url):
    tree = ET.parse(page_xml_file)
    xmlns = tree.getroot().tag.split('}')[0].strip('{')

    # Re-use the image URLs already recorded in an existing TSV file; otherwise start a new file with the header row.
    urls = []
    if os.path.exists(tsv_out_file):
        parts = extract_doc_links(tsv_out_file)
        urls = [part['url'] for part in parts]
    else:
        pd.DataFrame([], columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', 'url_id', 'left', 'right', 'top',
                                  'bottom']).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)

    tsv = []
    for words in tree.findall('.//{%s}Word' % xmlns):
        for word in words.findall('.//{%s}Unicode' % xmlns):
            text = word.text

        for coords in words.findall('.//{%s}Coords' % xmlns):
            # 'points' is the flat list of x,y vertices of the word's bounding polygon.
            points = [int(pos) for p in coords.attrib['points'].split(' ') for pos in p.split(',')]

            left = points[0]
            right = points[2]
            top = points[1]
            bottom = points[5]

            tsv.append((0, text, 'O', 'O', '-', len(urls), left, right, top, bottom))

    with open(tsv_out_file, 'a') as f:
        f.write('# ' + image_url + '\n')

    tsv = pd.DataFrame(tsv, columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID',
                                     'url_id', 'left', 'right', 'top', 'bottom'])
    tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
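For a quick check of what page2tsv has written, the TSV can be read back with pandas. A minimal sketch; the use of `comment='#'` to skip the `# <image-url>` lines assumes that no token itself contains a '#':
```
import pandas as pd

# Token rows only; the '# <image-url>' lines written by page2tsv are treated as comments and skipped.
tokens = pd.read_csv('PAGE.tsv', sep='\t', comment='#', quoting=3)
print(tokens[['TOKEN', 'url_id', 'left', 'right', 'top', 'bottom']].head())
```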

@@ -1,12 +0,0 @@
import sys
import codecs
import xml.etree.ElementTree as ET
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
tree = ET.parse(sys.argv[1])
xmlns = tree.getroot().tag.split('}')[0].strip('{')
for words in tree.findall('.//{%s}Word' % xmlns):
    for word in words.findall('.//{%s}Unicode' % xmlns):
        text = word.text
    for coords in words.findall('.//{%s}Coords' % xmlns):
        sys.stdout.write('0\t'+text+'\tO\tO\t'+coords.attrib['points']+'\n')

@@ -21,7 +21,8 @@ setup(
    entry_points={
        'console_scripts': [
            "extract-doc-links=cli:extract_document_links",
            "annotate-tsv=cli:annotate_tsv"
            "annotate-tsv=cli:annotate_tsv",
            "page2tsv=cli:page2tsv"
        ]
    },
    python_requires='>=3.6.0',
