From d6311edd0c88a77720a57b0a2ce8addbbcd31938 Mon Sep 17 00:00:00 2001 From: Kai Labusch Date: Thu, 31 Oct 2019 15:41:56 +0100 Subject: [PATCH] improve page2tsv tool --- ner-edith.js | 4 +++- ner.edith.html | 4 ++-- tools/README.md | 42 ++++++++++++++++++++++++++++++--------- tools/cli.py | 50 +++++++++++++++++++++++++++++++++++++++++++++-- tools/page2tsv.py | 12 ------------ tools/setup.py | 3 ++- 6 files changed, 88 insertions(+), 27 deletions(-) delete mode 100644 tools/page2tsv.py diff --git a/ner-edith.js b/ner-edith.js index 923030e..ce4f1b3 100644 --- a/ner-edith.js +++ b/ner-edith.js @@ -296,6 +296,8 @@ function setupInterface(data, file) { function updateTable() { + let do_not_display = new Set(['url_id', 'left', 'right', 'top', 'bottom']); + editingTd = null; let editable_html = @@ -318,7 +320,7 @@ function setupInterface(data, file) { $.each(el, function(column, content) { - if (column == 'url_id') return + if (do_not_display.has(column)) return var clickAction = function() { console.log('Do something different');} diff --git a/ner.edith.html b/ner.edith.html index bc377e6..2559e9d 100644 --- a/ner.edith.html +++ b/ner.edith.html @@ -66,12 +66,12 @@ facsimile_preview -
+
Please upload a TSV file in the GermEval2014 data format:

-
+
diff --git a/tools/README.md b/tools/README.md index 502f899..b070b59 100644 --- a/tools/README.md +++ b/tools/README.md @@ -22,29 +22,53 @@ Install package together with its dependencies in development mode: pip install -e ./ ``` -## Usage: +## PAGE-XML to TSV Transformation: -Create a URL-annotated TSV file from an existing TSV file: +Create a TSV file from OCR in PAGE-XML format (with word segmentation): ``` -annotate-tsv enp_DE.tsv enp_DE-annotated.tsv +page2tsv PAGE1.xml PAGE.tsv --image-url=http://link-to-corresponding-image-1 ``` -Create a corresponding URL-mapping file: + +In order to create a TSV file for multiple PAGE-XML files, just perform successive calls +of the tool using the same TSV file: ``` -extract-doc-links enp_DE.tsv enp_DE-urls.tsv +page2tsv PAGE1.xml PAGE.tsv --image-url=http://link-to-corresponding-image-1 +page2tsv PAGE2.xml PAGE.tsv --image-url=http://link-to-corresponding-image-2 +page2tsv PAGE3.xml PAGE.tsv --image-url=http://link-to-corresponding-image-3 +page2tsv PAGE4.xml PAGE.tsv --image-url=http://link-to-corresponding-image-4 +page2tsv PAGE5.xml PAGE.tsv --image-url=http://link-to-corresponding-image-5 +... +... +... ``` +A corresponding URL-mapping file can be obtained with: + +``` +extract-doc-links PAGE.tsv PAGE-urls.tsv +``` By loading the annotated TSV as well as the url mapping file into ner.edith, you will be able to jump directly to the original image where the full text has been extracted from. 
-# PAGE-XML to TSV Transformation +--- -## Usage: +## Processing of already existing TSV files: -Create a TSV file from OCR in PAGE-XML format (with word segmentation): +Create a URL-annotated TSV file from an existing TSV file: ``` -python page2tsv.py PAGE.xml > PAGE.tsv +annotate-tsv enp_DE.tsv enp_DE-annotated.tsv +``` +Create a corresponding URL-mapping file: + +``` +extract-doc-links enp_DE.tsv enp_DE-urls.tsv ``` + +By loading the annotated TSV as well as the url mapping file into +ner.edith, you will be able to jump directly to the original image +where the full text has been extracted from. + diff --git a/tools/cli.py b/tools/cli.py index ce9df71..95893d8 100644 --- a/tools/cli.py +++ b/tools/cli.py @@ -2,6 +2,8 @@ import re import click import pandas as pd from io import StringIO +import os +import xml.etree.ElementTree as ET @click.command() @@ -78,7 +80,7 @@ def extract_doc_links(tsv_file): line = "\t" + line - if line.count('\t') == 3: + if line.count('\t') >= 3: text.append(line + '\n') @@ -92,6 +94,50 @@ def extract_doc_links(tsv_file): print('Line error: |', line, '|Number of Tabs: ', line.count('\t')) - parts.append({"url": url, 'header': header, 'text': "".join(text)}) + if url is not None: + parts.append({"url": url, 'header': header, 'text': "".join(text)}) return parts + + +@click.command() +@click.argument('page-xml-file', type=click.Path(exists=True), required=True, nargs=1) +@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1) +@click.option('--image-url', type=str, default='') +def page2tsv(page_xml_file, tsv_out_file, image_url): + + tree = ET.parse(page_xml_file) + xmlns = tree.getroot().tag.split('}')[0].strip('{') + + urls = [] + if os.path.exists(tsv_out_file): + parts = extract_doc_links(tsv_out_file) + + urls = [part['url'] for part in parts] + else: + pd.DataFrame([], columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', 'url_id', 'left', 'right', 'top', + 'bottom']). 
to_csv(tsv_out_file, sep="\t", quoting=3, index=False) + + tsv = [] + for words in tree.findall('.//{%s}Word' % xmlns): + for word in words.findall('.//{%s}Unicode' % xmlns): + text = word.text + for coords in words.findall('.//{%s}Coords' % xmlns): + + points = [int(pos) for p in coords.attrib['points'].split(' ') for pos in p.split(',')] + + left = points[0] + right = points[2] + top = points[1] + bottom = points[5] + + tsv.append((0, text, 'O', 'O', '-', len(urls), left, right, top, bottom)) + + with open(tsv_out_file, 'a') as f: + + f.write('# ' + image_url + '\n') + + tsv = pd.DataFrame(tsv, columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', + 'url_id', 'left', 'right', 'top', 'bottom']) + + tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False) diff --git a/tools/page2tsv.py b/tools/page2tsv.py deleted file mode 100644 index 34e2ba3..0000000 --- a/tools/page2tsv.py +++ /dev/null @@ -1,12 +0,0 @@ -import sys -import codecs -import xml.etree.ElementTree as ET - -sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict') -tree = ET.parse(sys.argv[1]) -xmlns = tree.getroot().tag.split('}')[0].strip('{') -for words in tree.findall('.//{%s}Word' % xmlns): - for word in words.findall('.//{%s}Unicode' % xmlns): - text = word.text - for coords in words.findall('.//{%s}Coords' % xmlns): - sys.stdout.write('0\t'+text+'\tO\tO\t'+coords.attrib['points']+'\n') \ No newline at end of file diff --git a/tools/setup.py b/tools/setup.py index b415b01..beb79a3 100644 --- a/tools/setup.py +++ b/tools/setup.py @@ -21,7 +21,8 @@ setup( entry_points={ 'console_scripts': [ "extract-doc-links=cli:extract_document_links", - "annotate-tsv=cli:annotate_tsv" + "annotate-tsv=cli:annotate_tsv", + "page2tsv=cli:page2tsv" ] }, python_requires='>=3.6.0',