diff --git a/ner-edith.js b/ner-edith.js index 30b8721..ecb35c4 100644 --- a/ner-edith.js +++ b/ner-edith.js @@ -1,19 +1,120 @@ -var data; -var file = null; +function loadFile(evt, onComplete) { + + var file = evt.target.files[0]; + + Papa.parse(file, { + header: true, + delimiter: '\t', + quoteChar: String.fromCharCode(0), + escapeChar: String.fromCharCode(0), + comments: "#", + skipEmptyLines: true, + dynamicTyping: true, + complete: function(results) { onComplete(results, file) } + }); +} + var displayRows=30 var startIndex=0; var endIndex=displayRows; +var urls = null; + +function setupInterface(data, file) { + + function updateTable() { + + let editable_html = + ` +
OFFSET | +LOCATION | POSITION | TOKEN | NE-TAG | @@ -33,6 +134,53 @@ function loadFile(evt) { $("#btn-region").html(save_html) + $("#file-region").html('- `; - - $('#table-body').empty(); + ); - $.each(data.data, - function(nRow, el) { + $('#next').on('click', + function(evt) { - if (nRow < startIndex) return; - if (nRow >= endIndex) return; - - var row = $(" |
---|---|---|---|---|---|
").text(nRow)); - - $.each(el, - function(column, content) { - row.append( - $(editable_html). - text(content). - data('tableInfo', { 'nRow': nRow, 'column': column }) - ); - }); - - $("#table tbody").append(row); - }); - - $("#table td:contains('B-PER')").addClass('ner_per'); - $("#table td:contains('I-PER')").addClass('ner_per'); - $("#table td:contains('B-LOC')").addClass('ner_loc'); - $("#table td:contains('I-LOC')").addClass('ner_loc'); - $("#table td:contains('B-ORG')").addClass('ner_org'); - $("#table td:contains('I-ORG')").addClass('ner_org'); - $("#table td:contains('B-OTH')").addClass('ner_oth'); - $("#table td:contains('I-OTH')").addClass('ner_oth'); - $("#table td:contains('B-TODO')").addClass('ner_todo'); - $("#table td:contains('I-TODO')").addClass('ner_todo'); -} - -function saveFile(evt) { - - let csv = - Papa.unparse(data, - { - header: true, - delimiter: '\t', - comments: "#", - skipEmptyLines: true, - dynamicTyping: true - }); + if (endIndex + displayRows < data.data.length) { + endIndex += displayRows; + startIndex = endIndex - displayRows; + } + else { + endIndex = data.data.length; + startIndex = endIndex - displayRows; + } - openSaveFileDialog (csv, file.name, null) + updateTable(); + } + ); } -function openSaveFileDialog (data, filename, mimetype) { - - if (!data) return; - - var blob = data.constructor !== Blob - ? new Blob([data], {type: mimetype || 'application/octet-stream'}) - : data ; - - if (navigator.msSaveBlob) { - navigator.msSaveBlob(blob, filename); - return; - } - - var lnk = document.createElement('a'), - url = window.URL, - objectURL; - - if (mimetype) { - lnk.type = mimetype; - } - - lnk.download = filename || 'untitled'; - lnk.href = objectURL = url.createObjectURL(blob); - lnk.dispatchEvent(new MouseEvent('click')); - setTimeout(url.revokeObjectURL.bind(url, objectURL)); - -} - $(document).ready( function() { - $('#tsv-file').change(loadFile); + $('#tsv-file').change( + function(evt) { + + loadFile ( evt, + function(results, file) { + + setupInterface(results, file); + }) + } + ); } ); \ No newline at end of file diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 0000000..e69de29 diff --git a/tools/cli.py b/tools/cli.py new file mode 100644 index 0000000..83b8668 --- /dev/null +++ b/tools/cli.py @@ -0,0 +1,97 @@ +import re +import click +import pandas as pd +from io import StringIO + + +@click.command() +@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1) +def extract_document_links(tsv_file): + + parts = extract_doc_links(tsv_file) + + for part in parts: + + print(part['url']) + + +@click.command() +@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1) +@click.argument('annotated-tsv-file', type=click.Path(exists=False), required=True, nargs=1) +def annotate_tsv(tsv_file, annotated_tsv_file): + + parts = extract_doc_links(tsv_file) + + annotated_parts = [] + + urls = [] + + for part in parts: + + part_data = StringIO(part['header'] + part['text']) + urls.append(part['url']) + + df = pd.read_csv(part_data, sep="\t", comment='#', quoting=3) + + df['url_id'] = len(annotated_parts) + + annotated_parts.append(df) + + df = pd.concat(annotated_parts) + + df.to_csv(annotated_tsv_file, sep="\t", quoting=3, index=False) + + +def extract_doc_links(tsv_file): + + parts = [] + + header = None + + with open(tsv_file, 'r') as f: + + text = [] + url = None + + for line in f: + + if header is None: + header = "\t".join(line.split()) + '\n' + continue + + urls = [url for url in + re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)] + + if len(urls) > 0: + if url is not None: + parts.append({"url": url, 'header': header, 'text': "".join(text)}) + text = [] + + url = urls[-1] + else: + if url is None: + continue + + line = '\t'.join(line.split()) + + if line.count('\t') == 2: + + line = "\t" + line + + if line.count('\t') == 3: + + text.append(line + '\n') + + continue + + if line.startswith('#'): + continue + + if len(line) == 0: + continue + + print('Line error: |', line, '|Number of Tabs: ', line.count('\t')) + + parts.append({"url": url, 'header': header, 'text': "".join(text)}) + + return parts diff --git a/tools/requirements.txt b/tools/requirements.txt new file mode 100644 index 0000000..a45ae2b --- /dev/null +++ b/tools/requirements.txt @@ -0,0 +1,3 @@ +numpy +pandas +click \ No newline at end of file diff --git a/tools/setup.py b/tools/setup.py new file mode 100644 index 0000000..b415b01 --- /dev/null +++ b/tools/setup.py @@ -0,0 +1,35 @@ +from io import open +from setuptools import find_packages, setup + +with open('requirements.txt') as fp: + install_requires = fp.read() + +setup( + name="ner-edith", + version="0.0.1", + author="", + author_email="qurator@sbb.spk-berlin.de", + description="ner.edith", + long_description=open("README.md", "r", encoding='utf-8').read(), + long_description_content_type="text/markdown", + keywords='qurator', + license='Apache', + url="https://github.com/cneud/ner.edith", + packages=find_packages(exclude=["*.tests", "*.tests.*", + "tests.*", "tests"]), + install_requires=install_requires, + entry_points={ + 'console_scripts': [ + "extract-doc-links=cli:extract_document_links", + "annotate-tsv=cli:annotate_tsv" + ] + }, + python_requires='>=3.6.0', + tests_require=['pytest'], + classifiers=[ + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + ], +) |