mirror of
https://github.com/qurator-spk/neat.git
synced 2025-06-11 04:39:54 +02:00
add annotation tools and url mapping integration
This commit is contained in:
parent
3d10f7b001
commit
6afb0a6375
5 changed files with 345 additions and 155 deletions
365
ner-edith.js
365
ner-edith.js
|
@ -1,19 +1,120 @@
|
|||
// Parsed TSV result that the table renders and the save routine serialises
// (read as `data.data` by updateTable and passed to Papa.unparse by saveFile).
// NOTE(review): setupInterface(data, file) declares parameters with the same
// names, so inside it these globals appear to be shadowed — confirm.
var data;
|
||||
|
||||
// File object of the TSV being edited; its `name` is used as the heading and
// as the download filename. null until a file has been chosen.
var file = null;
|
||||
/**
 * Parse the file chosen in a file <input> as a headered TSV with PapaParse.
 *
 * Quote and escape characters are set to NUL so that quote characters inside
 * tokens are kept verbatim instead of being interpreted as field quoting.
 * Lines starting with '#' are treated as comments and empty lines are skipped.
 *
 * @param {Event} evt - change event of the file input; evt.target.files[0] is parsed.
 * @param {function(Object, File)} onComplete - called with the PapaParse
 *        result object and the File that was parsed.
 */
function loadFile(evt, onComplete) {

    const files = evt && evt.target ? evt.target.files : null;
    const selected = files ? files[0] : undefined;

    // The change event also fires when the dialog is cancelled or the
    // selection is cleared; there is nothing to parse in that case.
    if (!selected) {
        return;
    }

    Papa.parse(selected, {
        header: true,
        delimiter: '\t',
        quoteChar: String.fromCharCode(0),
        escapeChar: String.fromCharCode(0),
        comments: "#",
        skipEmptyLines: true,
        dynamicTyping: true,
        complete: function(results) { onComplete(results, selected); }
    });
}
|
||||
|
||||
|
||||
// Number of table rows rendered at once; also the step used by the
// back/next paging handlers.
var displayRows=30
|
||||
// Index of the first data row currently rendered (inclusive).
var startIndex=0;
|
||||
// Index one past the last data row currently rendered (exclusive).
var endIndex=displayRows;
|
||||
// Parsed URL-mapping file (PapaParse result), or null until one was uploaded.
var urls = null;
|
||||
|
||||
function loadFile(evt) {
|
||||
function setupInterface(data, file) {
|
||||
|
||||
let table_html =
|
||||
/**
 * Re-render the currently visible window of rows [startIndex, endIndex).
 *
 * Each rendered row starts with a link-styled button showing the row index,
 * followed by one editable cell per column (the 'url_id' column is hidden).
 * Cells whose text contains a B-/I- entity tag get the matching ner_* class.
 * Clicking a row-index button while no URL mapping is loaded replaces the
 * table with a prompt to upload a mapping file or return to edit mode.
 *
 * Reads the outer-scope names data, file, urls, startIndex and endIndex and
 * calls loadFile / setupInterface, all defined outside this function.
 */
function updateTable() {

    const editable_html =
        `
        <td class="editable">
        `;

    // Entity tag suffix -> CSS class applied to cells containing B-<tag> / I-<tag>.
    const tagClasses = {
        'PER': 'ner_per',
        'LOC': 'ner_loc',
        'ORG': 'ner_org',
        'OTH': 'ner_oth',
        'TODO': 'ner_todo'
    };

    $('#table-body').empty();

    // Only touch the rows that are actually displayed instead of scanning the
    // whole dataset on every redraw; clamping keeps the same rows visible when
    // the window overlaps either end of the data.
    const rows = data.data;
    const first = Math.max(startIndex, 0);
    const last = Math.min(endIndex, rows.length);

    for (let nRow = first; nRow < last; nRow++) {

        const row = $("<tr/>");

        row.append($('<td> <button class="btn btn-link btn-xs py-0 offset">' +
                     nRow + '</button> </td>'));

        $.each(rows[nRow],
            function(column, content) {

                // url_id is bookkeeping written by the annotate-tsv tool, not an editable field.
                if (column === 'url_id') return;

                row.append(
                    $(editable_html).
                        text(content).
                        data('tableInfo', { 'nRow': nRow, 'column': column })
                );
            });

        $("#table tbody").append(row);
    }

    Object.keys(tagClasses).forEach(
        function(tag) {
            $("#table td:contains('B-" + tag + "')").addClass(tagClasses[tag]);
            $("#table td:contains('I-" + tag + "')").addClass(tagClasses[tag]);
        });

    $(".offset").on('click',
        function(evt) {

            // A mapping is already loaded: nothing to ask the user for.
            if (urls != null) {
                return;
            }

            const url_mapping_html =
                `
                <br/>
                <br/>
                <br/>
                <input type="file" id="url-mapping-tsv-file" style="visibility: hidden; width: 1px; height: 1px"/>
                Please
                <a href="" onclick="$('#url-mapping-tsv-file').click(); return false">upload a url mapping file</a>
                or<button class="btn btn-link" id="goback">go back to edit mode.</button>
                `;

            $("#tableregion").html(url_mapping_html);
            $("#btn-region").empty();

            $('#goback').on('click',
                function(evt) {
                    setupInterface(data, file);
                }
            );

            $('#url-mapping-tsv-file').change(
                function(evt) {
                    loadFile(evt,
                        function(results, url_mapping_file) {
                            // Remember the mapping, then rebuild the editor view.
                            urls = results;

                            setupInterface(data, file);
                        });
                }
            );
        }
    );
}
|
||||
|
||||
let table_html =
|
||||
`
|
||||
<table id="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th><button class="btn btn-link" id="back"><<</button>OFFSET</th>
|
||||
<th><button class="btn btn-link" id="back"><<</button>LOCATION</th>
|
||||
<th>POSITION</th>
|
||||
<th>TOKEN</th>
|
||||
<th>NE-TAG</th>
|
||||
|
@ -33,6 +134,53 @@ function loadFile(evt) {
|
|||
|
||||
$("#btn-region").html(save_html)
|
||||
|
||||
$("#file-region").html('<h3>' + file.name + '</h3>');
|
||||
|
||||
/**
 * Serialise the table data back to TSV and offer it as a download under the
 * name of the loaded file.
 *
 * @param {Event} evt - click event of the save button (unused).
 */
function saveFile(evt) {

    // Same dialect as used for parsing: tab-separated, NUL as quote/escape
    // character so that quotes in the data are written verbatim.
    const tsvOptions = {
        header: true,
        delimiter: '\t',
        comments: "#",
        quoteChar: String.fromCharCode(0),
        escapeChar: String.fromCharCode(0),
        skipEmptyLines: true,
        dynamicTyping: true
    };

    const csv = Papa.unparse(data, tsvOptions);

    openSaveFileDialog(csv, file.name, null);
}
|
||||
|
||||
/**
 * Trigger a browser download of `data`.
 *
 * @param {string|Blob} data - content to save; nothing happens if it is falsy.
 * @param {string} filename - suggested file name ('untitled' if falsy).
 * @param {?string} mimetype - MIME type for the blob and the link; defaults
 *        to 'application/octet-stream' for the blob when falsy.
 */
function openSaveFileDialog (data, filename, mimetype) {

    if (!data) {
        return;
    }

    // Wrap anything that is not already a Blob.
    const payload = (data.constructor === Blob)
        ? data
        : new Blob([data], {type: mimetype || 'application/octet-stream'});

    // Legacy Microsoft browsers provide their own save API.
    if (navigator.msSaveBlob) {
        navigator.msSaveBlob(payload, filename);
        return;
    }

    const anchor = document.createElement('a');
    const urlApi = window.URL;

    if (mimetype) {
        anchor.type = mimetype;
    }

    anchor.download = filename || 'untitled';

    const objectURL = urlApi.createObjectURL(payload);
    anchor.href = objectURL;

    // Simulate a click on the (detached) link to start the download, then
    // release the object URL on the next timer tick.
    anchor.dispatchEvent(new MouseEvent('click'));
    setTimeout(function() { urlApi.revokeObjectURL(objectURL); });
}
|
||||
|
||||
$('.saveButton').on('click', saveFile)
|
||||
|
||||
let editingTd;
|
||||
|
@ -108,167 +256,74 @@ function loadFile(evt) {
|
|||
makeTdEditable(target);
|
||||
});
|
||||
|
||||
file = evt.target.files[0];
|
||||
updateTable();
|
||||
|
||||
// TODO: adapt to streaming with 'chunk' callback for large file support, see https://www.papaparse.com/docs
|
||||
Papa.parse(file, {
|
||||
header: true,
|
||||
delimiter: '\t',
|
||||
comments: "#",
|
||||
skipEmptyLines: true,
|
||||
dynamicTyping: true,
|
||||
complete: function(results) {
|
||||
//console.log(results);
|
||||
data = results;
|
||||
$('#tableregion')[0].addEventListener("wheel",
|
||||
function(event) {
|
||||
|
||||
if (event.deltaY < 0) {
|
||||
|
||||
if (startIndex <= 0) return;
|
||||
|
||||
startIndex -= 1;
|
||||
endIndex -= 1;
|
||||
}
|
||||
else {
|
||||
|
||||
if (endIndex >= data.data.length) return;
|
||||
|
||||
startIndex += 1;
|
||||
endIndex += 1;
|
||||
}
|
||||
|
||||
updateTable();
|
||||
});
|
||||
|
||||
$("#file-region").html('<h3>' + file.name + '</h3>');
|
||||
$('#back').on('click',
|
||||
function(evt) {
|
||||
|
||||
$('#tableregion')[0].addEventListener("wheel",
|
||||
function(event) {
|
||||
if (startIndex >= displayRows) {
|
||||
startIndex -= displayRows;
|
||||
endIndex -= displayRows;
|
||||
}
|
||||
else {
|
||||
startIndex = 0;
|
||||
endIndex = displayRows;
|
||||
}
|
||||
|
||||
if (event.deltaY < 0) {
|
||||
|
||||
if (startIndex <= 0) return;
|
||||
|
||||
startIndex -= 1;
|
||||
endIndex -= 1;
|
||||
}
|
||||
else {
|
||||
|
||||
if (endIndex >= data.data.length) return;
|
||||
|
||||
startIndex += 1;
|
||||
endIndex += 1;
|
||||
}
|
||||
|
||||
updateTable();
|
||||
});
|
||||
|
||||
$('#back').on('click',
|
||||
function(evt) {
|
||||
|
||||
if (startIndex >= displayRows) {
|
||||
startIndex -= displayRows;
|
||||
endIndex -= displayRows;
|
||||
}
|
||||
else {
|
||||
startIndex = 0;
|
||||
endIndex = displayRows;
|
||||
}
|
||||
|
||||
updateTable();
|
||||
}
|
||||
);
|
||||
|
||||
$('#next').on('click',
|
||||
function(evt) {
|
||||
|
||||
if (endIndex + displayRows < data.data.length) {
|
||||
endIndex += displayRows;
|
||||
startIndex = endIndex - displayRows;
|
||||
}
|
||||
else {
|
||||
endIndex = data.data.length;
|
||||
startIndex = endIndex - displayRows;
|
||||
}
|
||||
|
||||
updateTable();
|
||||
}
|
||||
);
|
||||
updateTable();
|
||||
}
|
||||
});
|
||||
);
|
||||
|
||||
$('#next').on('click',
|
||||
function(evt) {
|
||||
|
||||
if (endIndex + displayRows < data.data.length) {
|
||||
endIndex += displayRows;
|
||||
startIndex = endIndex - displayRows;
|
||||
}
|
||||
else {
|
||||
endIndex = data.data.length;
|
||||
startIndex = endIndex - displayRows;
|
||||
}
|
||||
|
||||
updateTable();
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Re-render the currently visible window of rows [startIndex, endIndex).
 *
 * Each rendered row starts with a plain cell holding the row index, followed
 * by one editable cell per column. Cells whose text contains a B-/I- entity
 * tag get the matching ner_* class.
 *
 * Reads the outer-scope names data, startIndex and endIndex.
 */
function updateTable() {

    const editable_html =
        `
        <td class="editable">
        `;

    // [tag suffix, CSS class] pairs for the entity highlighting below.
    const highlights = [
        ['PER', 'ner_per'],
        ['LOC', 'ner_loc'],
        ['ORG', 'ner_org'],
        ['OTH', 'ner_oth'],
        ['TODO', 'ner_todo']
    ];

    $('#table-body').empty();

    // Render only the displayed slice rather than visiting every row of the
    // dataset on each redraw; the bounds are clamped to the data.
    const lower = Math.max(startIndex, 0);
    const upper = Math.min(endIndex, data.data.length);

    for (let nRow = lower; nRow < upper; nRow++) {

        const row = $("<tr/>");
        row.append($("<td/>").text(nRow));

        $.each(data.data[nRow],
            function(column, content) {
                row.append(
                    $(editable_html).
                        text(content).
                        data('tableInfo', { 'nRow': nRow, 'column': column })
                );
            });

        $("#table tbody").append(row);
    }

    highlights.forEach(
        function(pair) {
            $("#table td:contains('B-" + pair[0] + "')").addClass(pair[1]);
            $("#table td:contains('I-" + pair[0] + "')").addClass(pair[1]);
        });
}
|
||||
|
||||
/**
 * Convert the table data to tab-separated text and hand it to the download
 * helper, using the loaded file's name.
 *
 * @param {Event} evt - click event of the save button (unused).
 */
function saveFile(evt) {

    const options = {};
    options.header = true;
    options.delimiter = '\t';
    options.comments = "#";
    options.skipEmptyLines = true;
    options.dynamicTyping = true;

    const tsvText = Papa.unparse(data, options);

    openSaveFileDialog(tsvText, file.name, null);
}
|
||||
|
||||
|
||||
/**
 * Offer `data` to the user as a file download.
 *
 * @param {string|Blob} data - content to save; a falsy value is ignored.
 * @param {string} filename - download name, 'untitled' when falsy.
 * @param {?string} mimetype - MIME type; when falsy the blob is created as
 *        'application/octet-stream' and the link type is left unset.
 */
function openSaveFileDialog (data, filename, mimetype) {

    if (!data) return;

    let blob;
    if (data.constructor !== Blob) {
        blob = new Blob([data], {type: mimetype || 'application/octet-stream'});
    }
    else {
        blob = data;
    }

    // Microsoft-specific save API takes precedence where it exists.
    if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, filename);
        return;
    }

    const link = document.createElement('a');
    const urlFactory = window.URL;

    if (mimetype) link.type = mimetype;

    link.download = filename || 'untitled';

    const objectURL = urlFactory.createObjectURL(blob);
    link.href = objectURL;

    // A synthetic click starts the download; the object URL is revoked on the
    // following timer tick.
    link.dispatchEvent(new MouseEvent('click'));
    setTimeout(() => urlFactory.revokeObjectURL(objectURL));
}
|
||||
|
||||
// Once the DOM is ready, hook the TSV file picker up to the editor.
$(document).ready(
    function() {

        // Exactly one change handler: loadFile needs an onComplete callback,
        // so it must not be registered directly as the event handler.
        $('#tsv-file').change(
            function(evt) {

                loadFile(evt,
                    function(results, file) {

                        // Build the editing interface for the parsed file.
                        setupInterface(results, file);
                    });
            }
        );
    }
);
|
0
tools/README.md
Normal file
0
tools/README.md
Normal file
97
tools/cli.py
Normal file
97
tools/cli.py
Normal file
|
@ -0,0 +1,97 @@
|
|||
import re
|
||||
import click
|
||||
import pandas as pd
|
||||
from io import StringIO
|
||||
|
||||
|
||||
@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
def extract_document_links(tsv_file):
    """Print the URL of every document part found in TSV_FILE, one per line."""

    # The file is processed completely before the first URL is printed.
    for url in [part['url'] for part in extract_doc_links(tsv_file)]:
        print(url)
|
||||
|
||||
|
||||
@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('annotated-tsv-file', type=click.Path(exists=False), required=True, nargs=1)
def annotate_tsv(tsv_file, annotated_tsv_file):
    """Write ANNOTATED_TSV_FILE: TSV_FILE with an added 'url_id' column.

    TSV_FILE is split into document parts by extract_doc_links(); all rows of
    the n-th part (zero-based) get url_id = n. The parts are concatenated in
    order and written tab-separated without an index column.
    """

    annotated_parts = []

    for url_id, part in enumerate(extract_doc_links(tsv_file)):

        # Each part is parsed on its own, with the shared header prepended.
        part_data = StringIO(part['header'] + part['text'])

        # quoting=3 is csv.QUOTE_NONE: quote characters in tokens are plain data.
        df = pd.read_csv(part_data, sep="\t", comment='#', quoting=3)

        df['url_id'] = url_id

        annotated_parts.append(df)

    df = pd.concat(annotated_parts)

    df.to_csv(annotated_tsv_file, sep="\t", quoting=3, index=False)
|
||||
|
||||
|
||||
def extract_doc_links(tsv_file):
|
||||
|
||||
parts = []
|
||||
|
||||
header = None
|
||||
|
||||
with open(tsv_file, 'r') as f:
|
||||
|
||||
text = []
|
||||
url = None
|
||||
|
||||
for line in f:
|
||||
|
||||
if header is None:
|
||||
header = "\t".join(line.split()) + '\n'
|
||||
continue
|
||||
|
||||
urls = [url for url in
|
||||
re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)]
|
||||
|
||||
if len(urls) > 0:
|
||||
if url is not None:
|
||||
parts.append({"url": url, 'header': header, 'text': "".join(text)})
|
||||
text = []
|
||||
|
||||
url = urls[-1]
|
||||
else:
|
||||
if url is None:
|
||||
continue
|
||||
|
||||
line = '\t'.join(line.split())
|
||||
|
||||
if line.count('\t') == 2:
|
||||
|
||||
line = "\t" + line
|
||||
|
||||
if line.count('\t') == 3:
|
||||
|
||||
text.append(line + '\n')
|
||||
|
||||
continue
|
||||
|
||||
if line.startswith('#'):
|
||||
continue
|
||||
|
||||
if len(line) == 0:
|
||||
continue
|
||||
|
||||
print('Line error: |', line, '|Number of Tabs: ', line.count('\t'))
|
||||
|
||||
parts.append({"url": url, 'header': header, 'text': "".join(text)})
|
||||
|
||||
return parts
|
3
tools/requirements.txt
Normal file
3
tools/requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
numpy
|
||||
pandas
|
||||
click
|
35
tools/setup.py
Normal file
35
tools/setup.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# Packaging script for the ner-edith command line tools
# (console scripts extract-doc-links and annotate-tsv from cli.py).
import os
from io import open
from setuptools import find_packages, setup

# Resolve data files relative to this script so that the build does not
# depend on the current working directory.
here = os.path.dirname(os.path.abspath(__file__))

with open(os.path.join(here, 'requirements.txt')) as fp:
    install_requires = fp.read()

# Read the README inside a context manager so the handle is closed again.
with open(os.path.join(here, "README.md"), "r", encoding='utf-8') as fp:
    long_description = fp.read()

setup(
    name="ner-edith",
    version="0.0.1",
    author="",
    author_email="qurator@sbb.spk-berlin.de",
    description="ner.edith",
    long_description=long_description,
    long_description_content_type="text/markdown",
    keywords='qurator',
    license='Apache',
    url="https://github.com/cneud/ner.edith",
    packages=find_packages(exclude=["*.tests", "*.tests.*",
                                    "tests.*", "tests"]),
    install_requires=install_requires,
    entry_points={
        'console_scripts': [
            "extract-doc-links=cli:extract_document_links",
            "annotate-tsv=cli:annotate_tsv"
        ]
    },
    python_requires='>=3.6.0',
    tests_require=['pytest'],
    classifiers=[
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
    ],
)
|
Loading…
Add table
Add a link
Reference in a new issue