ner + textline work

pull/2/head
Kai Labusch 5 years ago
parent 75c2c856cc
commit 3c12e5538a

@ -0,0 +1,5 @@
# Everything directly inside the data directory.
data/*
# Packaging metadata directory; setuptools names it <project>.egg-info
# (hyphen, not underscore), e.g. after an editable `pip install -e`.
*.egg-info
# NOTE(review): presumably a local Python virtual environment - confirm.
venv
# Model directory and packed archives (e.g. models.tar.gz).
models
*.tar.gz

@ -17,4 +17,4 @@ COPY . /usr/src/qurator-mono-repo
RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo
WORKDIR /usr/src/qurator-mono-repo WORKDIR /usr/src/qurator-mono-repo
CMD export LANG=C.UTF-8; env FLASK_APP=qurator/qurator_sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0 CMD export LANG=C.UTF-8; env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0

@ -11,9 +11,12 @@ RUN apt-get update && \
COPY requirements.txt /tmp COPY requirements.txt /tmp
RUN pip3 --no-cache-dir install -r /tmp/requirements.txt RUN pip3 --no-cache-dir install -r /tmp/requirements.txt
COPY . /usr/src/qurator-mono-repo COPY . /usr/src/qurator-sbb-ner
RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo RUN mkdir -p /usr/src/qurator-sbb-ner/konvens2019
RUN mkdir -p /usr/src/qurator-sbb-ner/digisam
WORKDIR /usr/src/qurator-mono-repo RUN pip3 --no-cache-dir install -e /usr/src/qurator-sbb-ner
CMD env FLASK_APP=qurator/qurator_sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0
WORKDIR /usr/src/qurator-sbb-ner
CMD env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0

@ -510,3 +510,7 @@ wikipedia-evaluation: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-LFT.pkl
wikipedia-evaluation2: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-SBB.pkl wikipedia-evaluation2: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-SBB.pkl
wikipedia-evaluation3: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-DE-CONLL-TESTA.pkl wikipedia-evaluation3: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-DE-CONLL-TESTA.pkl
###############################
# Pack the four model directories listed below into models.tar.gz.
# tar flags: -c create, -h follow symlinks, -z gzip, -f archive name.
# Paths matching the --exclude patterns ('*ep[1-6]*', '*eval*',
# 'pytorch_model.bin', '*.pkl') are left out of the archive.
# Declared phony because the recipe never creates a file called
# "model_archive"; without this a stray file of that name would stop the
# rule from ever running.
.PHONY: model_archive
model_archive:
	tar --exclude='*ep[1-6]*' --exclude='*eval*' --exclude='pytorch_model.bin' --exclude='*.pkl' \
		-chzf models.tar.gz \
		data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned \
		data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned \
		data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned \
		data/konvens2019/build-wd_0.03/bert-all-german-baseline

@ -317,12 +317,11 @@ def ner(model_id):
output = [] output = []
word = None
last_prediction = 'O'
for tokens, word_predictions in prediction: for tokens, word_predictions in prediction:
word = None
last_prediction = 'O' last_prediction = 'O'
output_sentence = []
for token, word_pred in zip(tokens, word_predictions): for token, word_pred in zip(tokens, word_predictions):
@ -331,7 +330,7 @@ def ner(model_id):
if not token.startswith('##'): if not token.startswith('##'):
if word is not None: if word is not None:
output.append({'word': word, 'prediction': last_prediction}) output_sentence.append({'word': word, 'prediction': last_prediction})
word = '' word = ''
@ -342,8 +341,10 @@ def ner(model_id):
if word_pred != 'X': if word_pred != 'X':
last_prediction = word_pred last_prediction = word_pred
if word is not None and len(word) > 0: if word is not None and len(word) > 0:
output.append({'word': word, 'prediction': last_prediction}) output_sentence.append({'word': word, 'prediction': last_prediction})
output.append(output_sentence)
return jsonify(output) return jsonify(output)

@ -9,7 +9,7 @@
<link rel="stylesheet" href="css/bootstrap.min.css" <link rel="stylesheet" href="css/bootstrap.min.css"
integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous"> integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
<title>NER auf den digitalen Sammlungen</title> <title>NER - Demo </title>
<script src="js/jquery-3.4.1.js"></script> <script src="js/jquery-3.4.1.js"></script>
</head> </head>
<body> <body>
@ -22,7 +22,7 @@
<div class="col-10"> <div class="col-10">
<div class="row"> <div class="row">
<div class="col-9 text-center"> <div class="col-9 text-center">
<h1>NER auf den digitalen Sammlungen</h1> <h1>NER - Demo</h1>
</div> </div>
<div class="col"> <div class="col">
</div> </div>
@ -35,7 +35,6 @@
<div class="form-group row ml-2"> <div class="form-group row ml-2">
<label for="task" class="col-sm-2 col-form-label">Task:</label> <label for="task" class="col-sm-2 col-form-label">Task:</label>
<select id="task" class="selectpicker col-md-auto" onchange="task_select()"> <select id="task" class="selectpicker col-md-auto" onchange="task_select()">
<option value="1">OCR-Text aus ALTO Datei</option>
<option value="2">Wort- und Satztokenisierung</option> <option value="2">Wort- und Satztokenisierung</option>
<option value="3" selected>Named Entity Recognition</option> <option value="3" selected>Named Entity Recognition</option>
<option value="4">BERT Tokens</option> <option value="4">BERT Tokens</option>
@ -48,10 +47,13 @@
</div> </div>
<div class="form-group row ml-2"> <div class="form-group row ml-2">
<label for="ppn" class="col-sm-2 col-form-label">PPN:</label> <label for="inputtext" class="col-sm-2 col-form-label">Input text:</label>
<input id="ppn" list="ppnexamples" class="col-sm-8" type="text"/> <!-- <input id="inputtext" class="col-sm-8" type="text" rows=10/> -->
<datalist id="ppnexamples"> <textarea id="inputtext" class=" col-sm-8 form-control" rows="3" required></textarea>
</datalist> </div>
<div class="form-group row ml-2">
<div class="col-sm-2"></div>
<button class="btn btn-primary" type="submit">Go</button> <button class="btn btn-primary" type="submit">Go</button>
</div> </div>
</form> </form>

@ -0,0 +1,155 @@
// Page start-up: wires the form, fills the two option lists from the server
// and applies the initial visibility of the model selector.
$(document).ready(function () {

    // Submitting the form must not reload the page; run the selected task instead.
    $('#nerform').submit(function (e) {
        e.preventDefault();
        load_ppn();
    });

    // Fill the model <select>. Each entry of the response is read for
    // .id, .name and .default (a truthy .default pre-selects the option).
    $.get("/models")
        .done(function (data) {
            var tmp = "";
            $.each(data, function (index, item) {
                // Declared locally: the original assigned to an undeclared
                // `selected`, which created an implicit global variable.
                var selected = item.default ? "selected" : "";

                tmp += '<option value="' + item.id + '" ' + selected + ' >' + item.name + '</option>';
            });
            $('#model').html(tmp);
        });

    // Fill the PPN suggestion list. Each entry of the response is read for
    // .ppn (option value) and .name (option label).
    $.get("/ppnexamples")
        .done(function (data) {
            var tmp = "";
            $.each(data, function (index, item) {
                tmp += '<option value="' + item.ppn + '">' + item.name + '</option>';
            });
            $('#ppnexamples').html(tmp);
        });

    task_select();
});
/**
 * React to a change of the task selector: the model row is hidden for
 * tasks whose value compares below 3 and shown otherwise, and any previous
 * result and legend are cleared.
 */
function task_select() {
    var modelRow = $('#model_select');
    // Same loose comparison as before: the select value is compared to the number 3.
    var showModel = !($('#task').val() < 3);

    if (showModel) {
        modelRow.show();
    } else {
        modelRow.hide();
    }

    $("#resultregion").html("");
    $("#legende").html("");
}
/**
 * Run the task chosen in #task for the PPN typed into #ppn and render the
 * server response into #resultregion.
 *
 * Task values and the endpoint each one requests:
 *   1 -> /digisam-fulltext/<ppn>
 *   2 -> /digisam-tokenized/<ppn>
 *   3 -> /digisam-ner/<model>/<ppn>              (also shows the legend)
 *   4 -> /digisam-ner-bert-tokens/<model>/<ppn>
 * Any other task value does nothing beyond clearing the legend.
 *
 * While the request is pending a spinner is shown; on failure the result
 * region shows 'Failed.'.
 */
function load_ppn() {

    var ppn = $('#ppn').val();

    var text_region_html =
`<div class="card">
<div class="card-header">
Ergebnis:
</div>
<div class="card-block">
<div id="textregion" style="overflow-y:scroll;height: 65vh;"></div>
</div>
</div>`;

    var legende_html =
`<div class="card">
<div class="card-header">
Legende:
<div class="ml-2" >[<font color="red">Person</font>]</div>
<div class="ml-2" >[<font color="green">Ort</font>]</div>
<div class="ml-2" >[<font color="blue">Organisation</font>]</div>
<div class="ml-2" >[keine Named Entity]</div>
</div>
</div>`;

    var spinner_html =
`<div class="d-flex justify-content-center">
<div class="spinner-border align-center" role="status">
<span class="sr-only">Loading...</span>
</div>
</div>`;

    $("#legende").html("");

    var task = $('#task').val();
    var model_id = $('#model').val();

    console.log("Task: " + task);

    // Both values end up as URL path segments. Percent-encode them so that
    // characters such as '/', '?', '#' or spaces typed by the user cannot
    // change which route is requested.
    var ppn_part = encodeURIComponent(ppn);
    var model_part = encodeURIComponent(model_id);

    var endpoints = {
        1: "/digisam-fulltext/" + ppn_part,
        2: "/digisam-tokenized/" + ppn_part,
        3: "/digisam-ner/" + model_part + "/" + ppn_part,
        4: "/digisam-ner-bert-tokens/" + model_part + "/" + ppn_part
    };

    // The select yields a string; convert it once so the lookup accepts the
    // same values the former loose `task == N` comparisons accepted.
    var task_no = Number(task);
    var url = endpoints[task_no];

    if (url === undefined) {
        return;
    }

    $("#resultregion").html(spinner_html);

    $.get(url)
        .done(function (data) {
            $("#resultregion").html(text_region_html);
            // NOTE(review): data.text is inserted as HTML, so the server is
            // trusted to return safe markup - confirm on the backend side.
            $("#textregion").html(data.text);

            // Only the NER task displays the colour legend.
            if (task_no === 3) {
                $("#legende").html(legende_html);
            }
        })
        .fail(function () {
            console.log('Failed.');
            $("#resultregion").html('Failed.');
        });
}

@ -4,7 +4,7 @@ $(document).ready(function(){
$('#nerform').submit( $('#nerform').submit(
function(e){ function(e){
e.preventDefault(); e.preventDefault();
load_ppn(); do_task();
} }
); );
@ -26,19 +26,6 @@ $(document).ready(function(){
} }
); );
$.get( "/ppnexamples")
.done(
function( data ) {
var tmp="";
$.each(data,
function(index, item){
tmp += '<option value="' + item.ppn + '">' + item.name + '</option>'
});
$('#ppnexamples').html(tmp);
}
);
task_select() task_select()
}); });
@ -58,9 +45,9 @@ function task_select() {
} }
function load_ppn() { function do_task() {
var ppn = $('#ppn').val() var input_text = $('#inputtext').val()
var text_region_html = var text_region_html =
`<div class="card"> `<div class="card">
@ -95,61 +82,102 @@ function load_ppn() {
var task = $('#task').val(); var task = $('#task').val();
var model_id = $('#model').val(); var model_id = $('#model').val();
console.log("Task: " + task); // if (task == 2) {
// $("#resultregion").html(spinner_html);
//
// $.get( "/digisam-tokenized/" + ppn,
// function( data ) {
// $("#resultregion").html(text_region_html)
// $("#textregion").html(data.text)
// }).fail(
// function() {
// console.log('Failed.')
// $("#resultregion").html('Failed.')
// });
// }
// else
//
if (task == 3) {
if (task == 1) {
$("#resultregion").html(spinner_html);
$.get( "/digisam-fulltext/" + ppn)
.done(function( data ) {
$("#resultregion").html(text_region_html)
$("#textregion").html(data.text)
})
.fail(
function() {
console.log('Failed.');
$("#resultregion").html('Failed.');
});
}
else if (task == 2) {
$("#resultregion").html(spinner_html); $("#resultregion").html(spinner_html);
$.get( "/digisam-tokenized/" + ppn, post_data = { "text" : input_text }
function( data ) {
$("#resultregion").html(text_region_html) console.log(post_data)
$("#textregion").html(data.text)
}).fail( $.ajax({
function() { url: "/ner/" + model_id,
console.log('Failed.') data: JSON.stringify(post_data),
$("#resultregion").html('Failed.') type: 'POST',
}); contentType: "application/json",
} success:
else if (task == 3) { function( data ) {
text_html = ""
$("#resultregion").html(spinner_html); data.forEach(
function(sentence) {
$.get( "/digisam-ner/" + model_id + "/" + ppn, sentence.forEach(
function( data ) { function(token) {
$("#resultregion").html(text_region_html)
$("#textregion").html(data.text) if (text_html != "") text_html += ' '
$("#legende").html(legende_html)
}).fail( if (token.prediction == 'O')
function(a,b,c) { text_html += token.word
console.log('Failed.') else if (token.prediction.endsWith('PER'))
$("#resultregion").html('Failed.') text_html += '<font color="red">' + token.word + '</font>'
}); else if (token.prediction.endsWith('LOC'))
} text_html += '<font color="green">' + token.word + '</font>'
else if (task == 4) { else if (token.prediction.endsWith('ORG'))
$("#resultregion").html(spinner_html); text_html += '<font color="blue">' + token.word + '</font>'
})
$.get( "/digisam-ner-bert-tokens/" + model_id + "/" + ppn, text_html += '<br/>'
function( data ) { }
$("#resultregion").html(text_region_html) )
$("#textregion").html(data.text) $("#resultregion").html(text_region_html)
}).fail( $("#textregion").html(text_html)
function(a,b,c) { $("#legende").html(legende_html)
console.log('Failed.') }
$("#resultregion").html('Failed.') ,
}); error: function(error) {
console.log(error);
}
});
// $.post( "/ner/" + model_id, post_data).done(
// function( data ) {
//
// text_region_html = ""
// data.forEach(
// function(sentence) {
// sentence.forEach(
// function(token) {
// text_region_html += token.word + "(" + token.prediction + ") "
// })
// }
// )
//
// $("#resultregion").html(text_region_html)
// $("#textregion").html(data.text)
// $("#legende").html(legende_html)
// }).fail(
// function(a,b,c) {
// console.log('Failed.')
// $("#resultregion").html('Failed.')
// });
} }
// else
//
// if (task == 4) {
// $("#resultregion").html(spinner_html);
//
// $.get( "/digisam-ner-bert-tokens/" + model_id + "/" + ppn,
// function( data ) {
// $("#resultregion").html(text_region_html)
// $("#textregion").html(data.text)
// }).fail(
// function(a,b,c) {
// console.log('Failed.')
// $("#resultregion").html('Failed.')
// });
// }
} }

@ -0,0 +1,77 @@
<!doctype html>
<!-- Single-page front end: a form (task, model, PPN) plus two empty regions for the result and the legend. -->
<html lang="en">
<head>
<!-- Required meta tags -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<!-- Bootstrap CSS -->
<link rel="stylesheet" href="css/bootstrap.min.css"
integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
<title>NER auf den digitalen Sammlungen</title>
<!-- jQuery is loaded in the head, before the page script at the end of the body -->
<script src="js/jquery-3.4.1.js"></script>
</head>
<body>
<div class="container-fluid" style="height: 95vh;">
<div class="row" style="margin-top: 5vh">
<div class="col-2">
</div>
<div class="col-10">
<!-- Page heading -->
<div class="row">
<div class="col-9 text-center">
<h1>NER auf den digitalen Sammlungen</h1>
</div>
<div class="col">
</div>
</div>
<div class="row" style="margin-top: 2vh">
<div class="col-9">
<div class="card">
<div class="card-block">
<!-- Input form; has no action attribute (submission is presumably handled by js/ner-ds-sbb.js) -->
<form class="mt-3 mb-3" role="form" id="nerform">
<!-- Task selector: task_select() is called on every change; value 3 is preselected -->
<div class="form-group row ml-2">
<label for="task" class="col-sm-2 col-form-label">Task:</label>
<select id="task" class="selectpicker col-md-auto" onchange="task_select()">
<option value="1">OCR-Text aus ALTO Datei</option>
<option value="2">Wort- und Satztokenisierung</option>
<option value="3" selected>Named Entity Recognition</option>
<option value="4">BERT Tokens</option>
</select>
</div>
<!-- Model selector: contains no options in the markup (presumably filled in by script) -->
<div class="form-group row ml-2" id="model_select">
<label for="model" class="col-sm-2 col-form-label">Model:</label>
<select id="model" class="selectpicker col-md-auto">
</select>
</div>
<!-- PPN input: suggestions come from the #ppnexamples datalist, which is empty in the markup -->
<div class="form-group row ml-2">
<label for="ppn" class="col-sm-2 col-form-label">PPN:</label>
<input id="ppn" list="ppnexamples" class="col-sm-8" type="text"/>
<datalist id="ppnexamples">
</datalist>
<button class="btn btn-primary" type="submit">Go</button>
</div>
</form>
</div>
</div>
</div>
<div class="col">
</div>
</div>
<!-- Empty placeholders: #resultregion for the output, #legende for the legend -->
<div class="row mt-5">
<div class="col-9" id="resultregion">
</div>
<div class="col" id="legende">
</div>
</div>
</div>
</div>
</div>
<!-- Page script, loaded last so the elements above already exist -->
<script src="js/ner-ds-sbb.js"></script>
</body>
</html>

@ -4,6 +4,7 @@ dask
pyarrow pyarrow
tqdm tqdm
pytorch-pretrained-bert==0.6.2 pytorch-pretrained-bert==0.6.2
# "sklearn" on PyPI is only a deprecated alias; the real distribution is scikit-learn.
scikit-learn
click click
langid langid
seqeval seqeval

Loading…
Cancel
Save