add annotation tools and url mapping integration

pull/39/head
Kai Labusch 5 years ago
parent 3d10f7b001
commit 6afb0a6375

@ -1,19 +1,120 @@
// Parse results of the loaded TSV file; assigned from a Papa.parse completion callback.
var data;
// File object of the TSV being edited; stays null until a file has been chosen.
var file = null;
/**
 * Parse the file picked in a file <input> as tab-separated text with Papa Parse.
 *
 * @param {Event} evt - change event of the file input; the first entry of
 *     evt.target.files is the file that gets parsed.
 * @param {function(Object, File): void} onComplete - invoked as
 *     onComplete(results, file) once Papa Parse has finished.
 */
function loadFile(evt, onComplete) {
    const selection = evt.target.files;
    const file = selection ? selection[0] : undefined;

    // A change event can arrive with an empty selection (e.g. the chooser was
    // cancelled); there is nothing to parse then, so do not call Papa.parse.
    if (!file) return;

    // NOTE(review): NUL as quote/escape character presumably disables quote
    // handling so that quote characters inside tokens stay literal - confirm.
    const nul = String.fromCharCode(0);

    Papa.parse(file, {
        header: true,
        delimiter: '\t',
        quoteChar: nul,
        escapeChar: nul,
        comments: "#",
        skipEmptyLines: true,
        dynamicTyping: true,
        complete: function(results) { onComplete(results, file); }
    });
}
// Number of table rows rendered at once; also the step used by the back/next buttons.
var displayRows=30
// Half-open window [startIndex, endIndex) of row indices that updateTable renders.
var startIndex=0;
var endIndex=displayRows;
// Parse results of the uploaded url mapping file; null until one has been loaded.
var urls = null;
function setupInterface(data, file) {
function updateTable() {
let editable_html =
`
<td class="editable">
`;
function loadFile(evt) {
$('#table-body').empty();
$.each(data.data,
function(nRow, el) {
if (nRow < startIndex) return;
if (nRow >= endIndex) return;
var row = $("<tr/>");
row.append($('<td> <button class="btn btn-link btn-xs py-0 offset">' +
nRow + '</button> </td>'));
$.each(el,
function(column, content) {
if (column == 'url_id') return
row.append(
$(editable_html).
text(content).
data('tableInfo', { 'nRow': nRow, 'column': column })
);
});
$("#table tbody").append(row);
});
$("#table td:contains('B-PER')").addClass('ner_per');
$("#table td:contains('I-PER')").addClass('ner_per');
$("#table td:contains('B-LOC')").addClass('ner_loc');
$("#table td:contains('I-LOC')").addClass('ner_loc');
$("#table td:contains('B-ORG')").addClass('ner_org');
$("#table td:contains('I-ORG')").addClass('ner_org');
$("#table td:contains('B-OTH')").addClass('ner_oth');
$("#table td:contains('I-OTH')").addClass('ner_oth');
$("#table td:contains('B-TODO')").addClass('ner_todo');
$("#table td:contains('I-TODO')").addClass('ner_todo');
$(".offset").on('click',
function(evt) {
if (urls != null) {
return;
}
let url_mapping_html =
`
<br/>
<br/>
<br/>
<input type="file" id="url-mapping-tsv-file" style="visibility: hidden; width: 1px; height: 1px"/>
Please
<a href="" onclick="$('#url-mapping-tsv-file').click(); return false">upload a url mapping file</a>
or<button class="btn btn-link" id="goback">go back to edit mode.</button>
`;
$("#tableregion").html(url_mapping_html);
$("#btn-region").empty();
$('#goback').on('click',
function(evt) {
setupInterface(data, file);
}
);
$('#url-mapping-tsv-file').change(
function(evt) {
loadFile(evt,
function(results, url_mapping_file) {
urls = results;
setupInterface(data, file);
});
}
);
}
);
}
let table_html =
let table_html =
`
<table id="table">
<thead>
<tr>
<th><button class="btn btn-link" id="back"><<</button>OFFSET</th>
<th><button class="btn btn-link" id="back"><<</button>LOCATION</th>
<th>POSITION</th>
<th>TOKEN</th>
<th>NE-TAG</th>
@ -33,6 +134,53 @@ function loadFile(evt) {
$("#btn-region").html(save_html)
$("#file-region").html('<h3>' + file.name + '</h3>');
// Serialize the table data to tab-separated text with Papa Parse and offer it
// for download under the name of the loaded file.
function saveFile(evt) {
    const nul = String.fromCharCode(0);

    const unparseConfig = {
        header: true,
        delimiter: '\t',
        comments: "#",
        quoteChar: nul,
        escapeChar: nul,
        skipEmptyLines: true,
        dynamicTyping: true
    };

    const tsvText = Papa.unparse(data, unparseConfig);

    // mimetype is left null, so the dialog helper picks its own default.
    openSaveFileDialog(tsvText, file.name, null);
}
/**
 * Offer `data` to the user as a file download.
 *
 * @param {(Blob|*)} data - content to save; anything that is not already a Blob
 *     is wrapped in one. Falsy values make the function return without doing anything.
 * @param {string} filename - suggested file name; 'untitled' is used when falsy.
 * @param {?string} mimetype - MIME type for the Blob and the link;
 *     'application/octet-stream' is used for the Blob when falsy.
 */
function openSaveFileDialog (data, filename, mimetype) {
    if (!data) return;

    let blob;
    if (data.constructor !== Blob) {
        blob = new Blob([data], {type: mimetype || 'application/octet-stream'});
    } else {
        blob = data;
    }

    // Browsers exposing msSaveBlob get the blob handed over directly.
    if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, filename);
        return;
    }

    // Otherwise: point a detached <a download> at an object URL and click it.
    const anchor = document.createElement('a');
    const urlApi = window.URL;

    if (mimetype) {
        anchor.type = mimetype;
    }
    anchor.download = filename || 'untitled';

    const objectURL = urlApi.createObjectURL(blob);
    anchor.href = objectURL;

    anchor.dispatchEvent(new MouseEvent('click'));

    // Release the object URL asynchronously, after the click has been dispatched.
    const release = urlApi.revokeObjectURL.bind(urlApi, objectURL);
    setTimeout(release);
}
$('.saveButton').on('click', saveFile)
let editingTd;
@ -108,167 +256,74 @@ function loadFile(evt) {
makeTdEditable(target);
});
file = evt.target.files[0];
// TODO: adapt to streaming with 'chunk' callback for large file support, see https://www.papaparse.com/docs
Papa.parse(file, {
header: true,
delimiter: '\t',
comments: "#",
skipEmptyLines: true,
dynamicTyping: true,
complete: function(results) {
//console.log(results);
data = results;
updateTable();
$("#file-region").html('<h3>' + file.name + '</h3>');
$('#tableregion')[0].addEventListener("wheel",
function(event) {
if (event.deltaY < 0) {
if (startIndex <= 0) return;
updateTable();
startIndex -= 1;
endIndex -= 1;
}
else {
$('#tableregion')[0].addEventListener("wheel",
function(event) {
if (endIndex >= data.data.length) return;
if (event.deltaY < 0) {
startIndex += 1;
endIndex += 1;
}
if (startIndex <= 0) return;
updateTable();
});
startIndex -= 1;
endIndex -= 1;
}
else {
$('#back').on('click',
function(evt) {
if (endIndex >= data.data.length) return;
if (startIndex >= displayRows) {
startIndex -= displayRows;
endIndex -= displayRows;
}
else {
startIndex = 0;
endIndex = displayRows;
}
startIndex += 1;
endIndex += 1;
}
updateTable();
}
);
updateTable();
});
$('#next').on('click',
function(evt) {
$('#back').on('click',
function(evt) {
if (endIndex + displayRows < data.data.length) {
endIndex += displayRows;
startIndex = endIndex - displayRows;
}
else {
endIndex = data.data.length;
startIndex = endIndex - displayRows;
}
if (startIndex >= displayRows) {
startIndex -= displayRows;
endIndex -= displayRows;
}
else {
startIndex = 0;
endIndex = displayRows;
}
updateTable();
}
);
updateTable();
}
});
}
function updateTable() {
let editable_html =
`
<td class="editable">
`;
$('#table-body').empty();
);
$.each(data.data,
function(nRow, el) {
$('#next').on('click',
function(evt) {
if (nRow < startIndex) return;
if (nRow >= endIndex) return;
var row = $("<tr/>");
row.append($("<td/>").text(nRow));
$.each(el,
function(column, content) {
row.append(
$(editable_html).
text(content).
data('tableInfo', { 'nRow': nRow, 'column': column })
);
});
$("#table tbody").append(row);
});
$("#table td:contains('B-PER')").addClass('ner_per');
$("#table td:contains('I-PER')").addClass('ner_per');
$("#table td:contains('B-LOC')").addClass('ner_loc');
$("#table td:contains('I-LOC')").addClass('ner_loc');
$("#table td:contains('B-ORG')").addClass('ner_org');
$("#table td:contains('I-ORG')").addClass('ner_org');
$("#table td:contains('B-OTH')").addClass('ner_oth');
$("#table td:contains('I-OTH')").addClass('ner_oth');
$("#table td:contains('B-TODO')").addClass('ner_todo');
$("#table td:contains('I-TODO')").addClass('ner_todo');
}
function saveFile(evt) {
let csv =
Papa.unparse(data,
{
header: true,
delimiter: '\t',
comments: "#",
skipEmptyLines: true,
dynamicTyping: true
});
if (endIndex + displayRows < data.data.length) {
endIndex += displayRows;
startIndex = endIndex - displayRows;
}
else {
endIndex = data.data.length;
startIndex = endIndex - displayRows;
}
openSaveFileDialog (csv, file.name, null)
updateTable();
}
);
}
/**
 * Start a browser download for the given content.
 *
 * @param {(Blob|*)} data - payload; used as is when it is a Blob, otherwise
 *     wrapped in a new Blob. A falsy payload is ignored.
 * @param {string} filename - download name ('untitled' when falsy).
 * @param {?string} mimetype - optional MIME type; the Blob falls back to
 *     'application/octet-stream' when it is falsy.
 */
function openSaveFileDialog (data, filename, mimetype) {
    if (!data) return;

    const alreadyBlob = data.constructor === Blob;
    const blob = alreadyBlob
        ? data
        : new Blob([data], {type: mimetype || 'application/octet-stream'});

    // Use msSaveBlob where the browser provides it.
    if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, filename);
        return;
    }

    // Generic path: a synthetic click on an <a download> that targets an object URL.
    const link = document.createElement('a');
    const urlFactory = window.URL;

    if (mimetype) {
        link.type = mimetype;
    }

    link.download = filename || 'untitled';

    const objectURL = urlFactory.createObjectURL(blob);
    link.href = objectURL;
    link.dispatchEvent(new MouseEvent('click'));

    // The object URL is revoked from a timer callback, i.e. after this function returns.
    setTimeout(urlFactory.revokeObjectURL.bind(urlFactory, objectURL));
}
// Once the DOM is ready, wire up the TSV file chooser: the chosen file is
// parsed by loadFile and the editing interface is built from the parse results.
$(document).ready(
    function() {
        // Register a single change handler. loadFile must not be passed to
        // .change() directly: it would be called with the event only, without
        // the completion callback it invokes when parsing has finished.
        $('#tsv-file').change(
            function(evt) {
                loadFile(evt,
                    function(results, file) {
                        setupInterface(results, file);
                    });
            }
        );
    }
);

@ -0,0 +1,97 @@
import re
import click
import pandas as pd
from io import StringIO
# CLI command: print the URL of every document part found in TSV_FILE, one per line.
@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
def extract_document_links(tsv_file):
    # extract_doc_links yields one dict per part; only its 'url' entry is printed.
    for entry in extract_doc_links(tsv_file):
        print(entry['url'])
@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('annotated-tsv-file', type=click.Path(exists=False), required=True, nargs=1)
def annotate_tsv(tsv_file, annotated_tsv_file):
    """
    Write TSV_FILE to ANNOTATED_TSV_FILE with an additional 'url_id' column.

    TSV_FILE is split into parts by extract_doc_links; every row receives the
    zero-based index of the part it belongs to as its 'url_id'.
    """
    parts = extract_doc_links(tsv_file)

    if len(parts) == 0:
        # pd.concat fails with an opaque "No objects to concatenate" on an
        # empty list, so report the actual problem instead.
        raise click.ClickException('No document parts found in {}'.format(tsv_file))

    annotated_parts = []

    for url_id, part in enumerate(parts):
        # Prepend the shared header so that each part parses as a standalone TSV.
        part_data = StringIO(part['header'] + part['text'])

        # quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
        df = pd.read_csv(part_data, sep="\t", comment='#', quoting=3)

        df['url_id'] = url_id

        annotated_parts.append(df)

    df = pd.concat(annotated_parts)

    df.to_csv(annotated_tsv_file, sep="\t", quoting=3, index=False)
# URL pattern used to detect the lines that start a new document part;
# compiled once instead of being re-specified for every input line.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def extract_doc_links(tsv_file):
    """
    Split a TSV file into parts that are delimited by lines containing a URL.

    The first line of the file is taken as the header (whitespace-separated
    fields re-joined with tabs). Every later line that contains at least one
    URL starts a new part whose 'url' is the last URL on that line. Data lines
    seen before the first URL line are dropped.

    :param tsv_file: path of the TSV file to read.
    :return: list of dicts with the keys 'url', 'header' and 'text', one per
        part in file order; empty if the file contains no URL line.
    """
    parts = []
    header = None

    with open(tsv_file, 'r') as f:

        text = []
        url = None

        for line in f:

            if header is None:
                header = "\t".join(line.split()) + '\n'
                continue

            urls = _URL_PATTERN.findall(line)

            if len(urls) > 0:
                # A URL line closes the part collected so far and opens the next one.
                if url is not None:
                    parts.append({"url": url, 'header': header, 'text': "".join(text)})
                    text = []

                url = urls[-1]
                continue

            if url is None:
                continue

            # Normalize any run of whitespace to a single tab.
            line = '\t'.join(line.split())

            # NOTE(review): a three-field line presumably lacks its first column;
            # an empty leading field is added so that it has four fields - confirm.
            if line.count('\t') == 2:
                line = "\t" + line

            if line.count('\t') == 3:
                text.append(line + '\n')
                continue

            if line.startswith('#'):
                continue

            if len(line) == 0:
                continue

            # Anything else does not have the expected number of fields; report and skip it.
            print('Line error: |', line, '|Number of Tabs: ', line.count('\t'))

    # Flush the last open part. Without any URL line there is no part to emit,
    # so no entry with 'url': None is produced.
    if url is not None:
        parts.append({"url": url, 'header': header, 'text': "".join(text)})

    return parts

@ -0,0 +1,3 @@
numpy
pandas
click

@ -0,0 +1,35 @@
from io import open
from setuptools import find_packages, setup
# Runtime dependencies live in requirements.txt; hand them to setuptools as a
# list with one requirement per entry, skipping blank lines and comments.
with open('requirements.txt') as fp:
    install_requires = [line.strip() for line in fp
                        if line.strip() and not line.strip().startswith('#')]

# Read the long description inside a context manager so the file handle is closed.
with open("README.md", "r", encoding='utf-8') as fp:
    long_description = fp.read()

setup(
    name="ner-edith",
    version="0.0.1",
    author="",
    author_email="qurator@sbb.spk-berlin.de",
    description="ner.edith",
    long_description=long_description,
    long_description_content_type="text/markdown",
    keywords='qurator',
    license='Apache',
    url="https://github.com/cneud/ner.edith",
    packages=find_packages(exclude=["*.tests", "*.tests.*",
                                    "tests.*", "tests"]),
    install_requires=install_requires,
    # NOTE(review): both console scripts import a top-level module named `cli`;
    # find_packages() only collects packages, so confirm that this module is
    # actually part of the installation.
    entry_points={
        'console_scripts': [
            "extract-doc-links=cli:extract_document_links",
            "annotate-tsv=cli:annotate_tsv"
        ]
    },
    python_requires='>=3.6.0',
    tests_require=['pytest'],
    classifiers=[
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
    ],
)
Loading…
Cancel
Save