
improve page2tsv tool

Kai Labusch 2019-10-31 15:41:56 +01:00
parent a206504560
commit d6311edd0c
6 changed files with 89 additions and 28 deletions

README.md

@@ -22,7 +22,40 @@ Install package together with its dependencies in development mode:
pip install -e ./
```
## Usage:
## PAGE-XML to TSV Transformation:
Create a TSV file from OCR in PAGE-XML format (with word segmentation):
```
page2tsv PAGE1.xml PAGE.tsv --image-url=http://link-to-corresponding-image-1
```
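Each page2tsv call writes a `#`-prefixed line carrying the image URL, followed by one row per word (see cli.py below), so the resulting file looks roughly like this (tokens and coordinates are made-up placeholders):
```
No.	TOKEN	NE-TAG	NE-EMB	GND-ID	url_id	left	right	top	bottom
# http://link-to-corresponding-image-1
0	Berlin	O	O	-	0	100	180	50	70
0	1910	O	O	-	0	190	230	50	70
```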
To create a single TSV file from multiple PAGE-XML files, just run the tool successively with the same TSV output file:
```
page2tsv PAGE1.xml PAGE.tsv --image-url=http://link-to-corresponding-image-1
page2tsv PAGE2.xml PAGE.tsv --image-url=http://link-to-corresponding-image-2
page2tsv PAGE3.xml PAGE.tsv --image-url=http://link-to-corresponding-image-3
page2tsv PAGE4.xml PAGE.tsv --image-url=http://link-to-corresponding-image-4
page2tsv PAGE5.xml PAGE.tsv --image-url=http://link-to-corresponding-image-5
...
```
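For larger batches, the same successive calls can be scripted. A minimal Python sketch, assuming files named PAGE1.xml through PAGE5.xml and the URL scheme from above:
```
import subprocess

# hypothetical list of (PAGE-XML file, image URL) pairs
pages = [('PAGE{}.xml'.format(i), 'http://link-to-corresponding-image-{}'.format(i))
         for i in range(1, 6)]

for xml_file, url in pages:
    # each call appends to the same TSV file, so the call order
    # determines the document order in PAGE.tsv
    subprocess.run(['page2tsv', xml_file, 'PAGE.tsv', '--image-url=' + url], check=True)
```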
A corresponding URL mapping file can be obtained with:
```
extract-doc-links PAGE.tsv PAGE-urls.tsv
```
By loading the annotated TSV file as well as the URL mapping file into
ner.edith, you can jump directly to the original image from which the
full text was extracted.
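Conceptually, the mapping is recovered from the `# <image-url>` comment lines that page2tsv writes in front of every document block. A minimal sketch of that idea (not the actual extract-doc-links implementation):
```
# collect the image URLs in the order their document blocks appear;
# the list position corresponds to the url_id column in the TSV rows
urls = []
with open('PAGE.tsv') as f:
    for line in f:
        if line.startswith('#'):
            urls.append(line[1:].strip())

for url_id, url in enumerate(urls):
    print(url_id, url, sep='\t')
```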
---
## Processing of existing TSV files:
Create a URL-annotated TSV file from an existing TSV file:
@@ -39,12 +72,3 @@ By loading the annotated TSV as well as the URL mapping file into
ner.edith, you will be able to jump directly to the original image
from which the full text was extracted.
# PAGE-XML to TSV Transformation
## Usage:
Create a TSV file from OCR in PAGE-XML format (with word segmentation):
```
python page2tsv.py PAGE.xml > PAGE.tsv
```

cli.py

@@ -2,6 +2,8 @@ import re
import click
import pandas as pd
from io import StringIO
import os
import xml.etree.ElementTree as ET
@click.command()
@@ -78,7 +80,7 @@ def extract_doc_links(tsv_file):
             line = "\t" + line
-            if line.count('\t') == 3:
+            if line.count('\t') >= 3:
                 text.append(line + '\n')
@@ -92,6 +94,50 @@ def extract_doc_links(tsv_file):
             print('Line error: |', line, '|Number of Tabs: ', line.count('\t'))
-    parts.append({"url": url, 'header': header, 'text': "".join(text)})
+    if url is not None:
+        parts.append({"url": url, 'header': header, 'text': "".join(text)})
     return parts

@click.command()
@click.argument('page-xml-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1)
@click.option('--image-url', type=str, default='')
def page2tsv(page_xml_file, tsv_out_file, image_url):

    tree = ET.parse(page_xml_file)
    xmlns = tree.getroot().tag.split('}')[0].strip('{')

    urls = []
    if os.path.exists(tsv_out_file):
        # appending to an existing TSV file: recover the image URLs that are
        # already in it so that the new rows get the next url_id
        parts = extract_doc_links(tsv_out_file)
        urls = [part['url'] for part in parts]
    else:
        # fresh TSV file: write the header row first
        pd.DataFrame([], columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', 'url_id', 'left', 'right', 'top',
                                  'bottom']).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)

    tsv = []
    for words in tree.findall('.//{%s}Word' % xmlns):
        for word in words.findall('.//{%s}Unicode' % xmlns):
            text = word.text
            for coords in words.findall('.//{%s}Coords' % xmlns):
                # flatten the "x,y x,y ..." points attribute into [x0, y0, x1, y1, ...]
                points = [int(pos) for p in coords.attrib['points'].split(' ') for pos in p.split(',')]

                left = points[0]
                right = points[2]
                top = points[1]
                bottom = points[5]

                tsv.append((0, text, 'O', 'O', '-', len(urls), left, right, top, bottom))

    with open(tsv_out_file, 'a') as f:
        f.write('# ' + image_url + '\n')

    tsv = pd.DataFrame(tsv, columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID',
                                     'url_id', 'left', 'right', 'top', 'bottom'])
    tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
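Note that the bounding-box indexing above (points[0], points[2], points[1], points[5]) assumes the polygon in the PAGE-XML points attribute starts at the top-left corner and runs clockwise. A small sketch of that assumption:
```
# assumed point order: top-left, top-right, bottom-right, bottom-left,
# flattened to [x0, y0, x1, y1, x2, y2, x3, y3]
points = [100, 50, 180, 50, 180, 70, 100, 70]  # hypothetical word box

left, top = points[0], points[1]   # top-left corner
right = points[2]                  # x of the top-right corner
bottom = points[5]                 # y of the bottom-right corner

assert (left, right, top, bottom) == (100, 180, 50, 70)
```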

page2tsv.py

@@ -1,12 +0,0 @@
import sys
import codecs

import xml.etree.ElementTree as ET

sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')

tree = ET.parse(sys.argv[1])
xmlns = tree.getroot().tag.split('}')[0].strip('{')

for words in tree.findall('.//{%s}Word' % xmlns):
    for word in words.findall('.//{%s}Unicode' % xmlns):
        text = word.text
        for coords in words.findall('.//{%s}Coords' % xmlns):
            sys.stdout.write('0\t' + text + '\tO\tO\t' + coords.attrib['points'] + '\n')

setup.py

@@ -21,7 +21,8 @@ setup(
     entry_points={
         'console_scripts': [
             "extract-doc-links=cli:extract_document_links",
-            "annotate-tsv=cli:annotate_tsv"
+            "annotate-tsv=cli:annotate_tsv",
+            "page2tsv=cli:page2tsv"
         ]
     },
python_requires='>=3.6.0',