From 6afb0a6375e024cf741171fc9c4efc0eadeb951e Mon Sep 17 00:00:00 2001 From: Kai Labusch Date: Mon, 23 Sep 2019 16:31:02 +0200 Subject: [PATCH] add annotation tools and url mapping integration --- ner-edith.js | 351 ++++++++++++++++++++++++----------------- tools/README.md | 0 tools/cli.py | 97 ++++++++++++ tools/requirements.txt | 3 + tools/setup.py | 35 ++++ 5 files changed, 338 insertions(+), 148 deletions(-) create mode 100644 tools/README.md create mode 100644 tools/cli.py create mode 100644 tools/requirements.txt create mode 100644 tools/setup.py diff --git a/ner-edith.js b/ner-edith.js index 30b8721..ecb35c4 100644 --- a/ner-edith.js +++ b/ner-edith.js @@ -1,19 +1,120 @@ -var data; -var file = null; +function loadFile(evt, onComplete) { + + var file = evt.target.files[0]; + + Papa.parse(file, { + header: true, + delimiter: '\t', + quoteChar: String.fromCharCode(0), + escapeChar: String.fromCharCode(0), + comments: "#", + skipEmptyLines: true, + dynamicTyping: true, + complete: function(results) { onComplete(results, file) } + }); +} + var displayRows=30 var startIndex=0; var endIndex=displayRows; +var urls = null; + +function setupInterface(data, file) { + + function updateTable() { + + let editable_html = + ` + + `; -function loadFile(evt) { + $('#table-body').empty(); + + $.each(data.data, + function(nRow, el) { + + if (nRow < startIndex) return; + if (nRow >= endIndex) return; + + var row = $(""); + row.append($(' ')); + + $.each(el, + function(column, content) { + + if (column == 'url_id') return + + row.append( + $(editable_html). + text(content). + data('tableInfo', { 'nRow': nRow, 'column': column }) + ); + }); + + $("#table tbody").append(row); + }); + + $("#table td:contains('B-PER')").addClass('ner_per'); + $("#table td:contains('I-PER')").addClass('ner_per'); + $("#table td:contains('B-LOC')").addClass('ner_loc'); + $("#table td:contains('I-LOC')").addClass('ner_loc'); + $("#table td:contains('B-ORG')").addClass('ner_org'); + $("#table td:contains('I-ORG')").addClass('ner_org'); + $("#table td:contains('B-OTH')").addClass('ner_oth'); + $("#table td:contains('I-OTH')").addClass('ner_oth'); + $("#table td:contains('B-TODO')").addClass('ner_todo'); + $("#table td:contains('I-TODO')").addClass('ner_todo'); + + $(".offset").on('click', + function(evt) { + + if (urls != null) { + return; + } + + let url_mapping_html = + ` +
+
+
+ + Please + upload a url mapping file + or + `; + + $("#tableregion").html(url_mapping_html); + $("#btn-region").empty(); + + $('#goback').on('click', + function(evt) { + setupInterface(data, file); + } + ); + + $('#url-mapping-tsv-file').change( + function(evt) { + loadFile(evt, + function(results, url_mapping_file) { + urls = results; + + setupInterface(data, file); + }); + } + ); + } + ); + } - let table_html = + let table_html = ` - + @@ -33,6 +134,53 @@ function loadFile(evt) { $("#btn-region").html(save_html) + $("#file-region").html('

' + file.name + '

'); + + function saveFile(evt) { + + let csv = + Papa.unparse(data, + { + header: true, + delimiter: '\t', + comments: "#", + quoteChar: String.fromCharCode(0), + escapeChar: String.fromCharCode(0), + skipEmptyLines: true, + dynamicTyping: true + }); + + openSaveFileDialog (csv, file.name, null) + } + + function openSaveFileDialog (data, filename, mimetype) { + + if (!data) return; + + var blob = data.constructor !== Blob + ? new Blob([data], {type: mimetype || 'application/octet-stream'}) + : data ; + + if (navigator.msSaveBlob) { + navigator.msSaveBlob(blob, filename); + return; + } + + var lnk = document.createElement('a'), + url = window.URL, + objectURL; + + if (mimetype) { + lnk.type = mimetype; + } + + lnk.download = filename || 'untitled'; + lnk.href = objectURL = url.createObjectURL(blob); + lnk.dispatchEvent(new MouseEvent('click')); + setTimeout(url.revokeObjectURL.bind(url, objectURL)); + + } + $('.saveButton').on('click', saveFile) let editingTd; @@ -108,167 +256,74 @@ function loadFile(evt) { makeTdEditable(target); }); - file = evt.target.files[0]; - - // TODO: adapt to streaming with 'chunk' callback for large file support, see https://www.papaparse.com/docs - Papa.parse(file, { - header: true, - delimiter: '\t', - comments: "#", - skipEmptyLines: true, - dynamicTyping: true, - complete: function(results) { - //console.log(results); - data = results; - - updateTable(); - - $("#file-region").html('

' + file.name + '

'); - - $('#tableregion')[0].addEventListener("wheel", - function(event) { - - if (event.deltaY < 0) { - - if (startIndex <= 0) return; + updateTable(); - startIndex -= 1; - endIndex -= 1; - } - else { + $('#tableregion')[0].addEventListener("wheel", + function(event) { - if (endIndex >= data.data.length) return; + if (event.deltaY < 0) { - startIndex += 1; - endIndex += 1; - } + if (startIndex <= 0) return; - updateTable(); - }); + startIndex -= 1; + endIndex -= 1; + } + else { - $('#back').on('click', - function(evt) { + if (endIndex >= data.data.length) return; - if (startIndex >= displayRows) { - startIndex -= displayRows; - endIndex -= displayRows; - } - else { - startIndex = 0; - endIndex = displayRows; - } + startIndex += 1; + endIndex += 1; + } - updateTable(); - } - ); + updateTable(); + }); - $('#next').on('click', - function(evt) { + $('#back').on('click', + function(evt) { - if (endIndex + displayRows < data.data.length) { - endIndex += displayRows; - startIndex = endIndex - displayRows; - } - else { - endIndex = data.data.length; - startIndex = endIndex - displayRows; - } + if (startIndex >= displayRows) { + startIndex -= displayRows; + endIndex -= displayRows; + } + else { + startIndex = 0; + endIndex = displayRows; + } - updateTable(); - } - ); + updateTable(); } - }); -} - -function updateTable() { - - let editable_html = - ` - "); - row.append($("
OFFSETLOCATION POSITION TOKEN NE-TAG - `; - - $('#table-body').empty(); + ); - $.each(data.data, - function(nRow, el) { + $('#next').on('click', + function(evt) { - if (nRow < startIndex) return; - if (nRow >= endIndex) return; - - var row = $("
").text(nRow)); - - $.each(el, - function(column, content) { - row.append( - $(editable_html). - text(content). - data('tableInfo', { 'nRow': nRow, 'column': column }) - ); - }); - - $("#table tbody").append(row); - }); - - $("#table td:contains('B-PER')").addClass('ner_per'); - $("#table td:contains('I-PER')").addClass('ner_per'); - $("#table td:contains('B-LOC')").addClass('ner_loc'); - $("#table td:contains('I-LOC')").addClass('ner_loc'); - $("#table td:contains('B-ORG')").addClass('ner_org'); - $("#table td:contains('I-ORG')").addClass('ner_org'); - $("#table td:contains('B-OTH')").addClass('ner_oth'); - $("#table td:contains('I-OTH')").addClass('ner_oth'); - $("#table td:contains('B-TODO')").addClass('ner_todo'); - $("#table td:contains('I-TODO')").addClass('ner_todo'); -} - -function saveFile(evt) { - - let csv = - Papa.unparse(data, - { - header: true, - delimiter: '\t', - comments: "#", - skipEmptyLines: true, - dynamicTyping: true - }); + if (endIndex + displayRows < data.data.length) { + endIndex += displayRows; + startIndex = endIndex - displayRows; + } + else { + endIndex = data.data.length; + startIndex = endIndex - displayRows; + } - openSaveFileDialog (csv, file.name, null) + updateTable(); + } + ); } -function openSaveFileDialog (data, filename, mimetype) { - - if (!data) return; - - var blob = data.constructor !== Blob - ? new Blob([data], {type: mimetype || 'application/octet-stream'}) - : data ; - - if (navigator.msSaveBlob) { - navigator.msSaveBlob(blob, filename); - return; - } - - var lnk = document.createElement('a'), - url = window.URL, - objectURL; - - if (mimetype) { - lnk.type = mimetype; - } - - lnk.download = filename || 'untitled'; - lnk.href = objectURL = url.createObjectURL(blob); - lnk.dispatchEvent(new MouseEvent('click')); - setTimeout(url.revokeObjectURL.bind(url, objectURL)); - -} - $(document).ready( function() { - $('#tsv-file').change(loadFile); + $('#tsv-file').change( + function(evt) { + + loadFile ( evt, + function(results, file) { + + setupInterface(results, file); + }) + } + ); } ); \ No newline at end of file diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 0000000..e69de29 diff --git a/tools/cli.py b/tools/cli.py new file mode 100644 index 0000000..83b8668 --- /dev/null +++ b/tools/cli.py @@ -0,0 +1,97 @@ +import re +import click +import pandas as pd +from io import StringIO + + +@click.command() +@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1) +def extract_document_links(tsv_file): + + parts = extract_doc_links(tsv_file) + + for part in parts: + + print(part['url']) + + +@click.command() +@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1) +@click.argument('annotated-tsv-file', type=click.Path(exists=False), required=True, nargs=1) +def annotate_tsv(tsv_file, annotated_tsv_file): + + parts = extract_doc_links(tsv_file) + + annotated_parts = [] + + urls = [] + + for part in parts: + + part_data = StringIO(part['header'] + part['text']) + urls.append(part['url']) + + df = pd.read_csv(part_data, sep="\t", comment='#', quoting=3) + + df['url_id'] = len(annotated_parts) + + annotated_parts.append(df) + + df = pd.concat(annotated_parts) + + df.to_csv(annotated_tsv_file, sep="\t", quoting=3, index=False) + + +def extract_doc_links(tsv_file): + + parts = [] + + header = None + + with open(tsv_file, 'r') as f: + + text = [] + url = None + + for line in f: + + if header is None: + header = "\t".join(line.split()) + '\n' + continue + + urls = [url for url in + re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)] + + if len(urls) > 0: + if url is not None: + parts.append({"url": url, 'header': header, 'text': "".join(text)}) + text = [] + + url = urls[-1] + else: + if url is None: + continue + + line = '\t'.join(line.split()) + + if line.count('\t') == 2: + + line = "\t" + line + + if line.count('\t') == 3: + + text.append(line + '\n') + + continue + + if line.startswith('#'): + continue + + if len(line) == 0: + continue + + print('Line error: |', line, '|Number of Tabs: ', line.count('\t')) + + parts.append({"url": url, 'header': header, 'text': "".join(text)}) + + return parts diff --git a/tools/requirements.txt b/tools/requirements.txt new file mode 100644 index 0000000..a45ae2b --- /dev/null +++ b/tools/requirements.txt @@ -0,0 +1,3 @@ +numpy +pandas +click \ No newline at end of file diff --git a/tools/setup.py b/tools/setup.py new file mode 100644 index 0000000..b415b01 --- /dev/null +++ b/tools/setup.py @@ -0,0 +1,35 @@ +from io import open +from setuptools import find_packages, setup + +with open('requirements.txt') as fp: + install_requires = fp.read() + +setup( + name="ner-edith", + version="0.0.1", + author="", + author_email="qurator@sbb.spk-berlin.de", + description="ner.edith", + long_description=open("README.md", "r", encoding='utf-8').read(), + long_description_content_type="text/markdown", + keywords='qurator', + license='Apache', + url="https://github.com/cneud/ner.edith", + packages=find_packages(exclude=["*.tests", "*.tests.*", + "tests.*", "tests"]), + install_requires=install_requires, + entry_points={ + 'console_scripts': [ + "extract-doc-links=cli:extract_document_links", + "annotate-tsv=cli:annotate_tsv" + ] + }, + python_requires='>=3.6.0', + tests_require=['pytest'], + classifiers=[ + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + ], +)