diff --git a/qurator/sbb_ner/models/bert.py b/qurator/sbb_ner/models/bert.py index 3bdce31..be18335 100644 --- a/qurator/sbb_ner/models/bert.py +++ b/qurator/sbb_ner/models/bert.py @@ -385,6 +385,8 @@ def model_predict(dataloader, device, label_map, model): temp_2.pop() # skip last token since its [SEP] y_pred.append(temp_2) break + else: + y_pred.append(temp_2) return y_pred diff --git a/qurator/sbb_ner/webapp/app.py b/qurator/sbb_ner/webapp/app.py index 3629b2f..6de6d45 100644 --- a/qurator/sbb_ner/webapp/app.py +++ b/qurator/sbb_ner/webapp/app.py @@ -93,12 +93,16 @@ class NERPredictor: features = [convert_examples_to_features(ex, self._label_to_id, self._max_seq_length, self._bert_tokenizer) for ex in examples] + assert len(sentences) == len(features) + data_loader = NerProcessor.make_data_loader(None, self._batch_size, self._local_rank, self._label_to_id, self._max_seq_length, self._bert_tokenizer, features=features, sequential=True) prediction_tmp = model_predict(data_loader, self._device, self._label_map, self._model) + assert len(sentences) == len(prediction_tmp) + prediction = [] for fe, pr in zip(features, prediction_tmp): prediction.append((fe.tokens[1:-1], pr)) @@ -185,125 +189,47 @@ def fulltext(ppn): if row_data.text is None: continue - text += html.escape(str(row_data.text)) + '


' - - ret = {'text': text, 'ppn': ppn} - - return jsonify(ret) - - -@app.route('/digisam-tokenized/') -def tokenized(ppn): - - df = digisam.get(ppn) - - if len(df) == 0: - return 'bad request!', 400 - - text = '' - for row_index, row_data in df.iterrows(): - - if row_data.text is None: - continue - - sentences = tokenizer.parse_text(row_data.text) - - for sen, _ in sentences: - - text += html.escape(str(sen)) + '
' - - text += '


' - - ret = {'text': text, 'ppn': ppn} - - return jsonify(ret) - - -@app.route('/ner-bert-tokens//') -def ner_bert_tokens(model_id, ppn): - - df = digisam.get(ppn) - - if len(df) == 0: - return 'bad request!', 400 - - text = '' - for row_index, row_data in df.iterrows(): - - if row_data.text is None: - continue - - sentences = tokenizer.parse_text(row_data.text) - - prediction = predictor_store.get(model_id).classify_text(sentences) - - for tokens, word_predictions in prediction: - - for token, word_pred in zip(tokens, word_predictions): - - text += html.escape("{}({})".format(token, word_pred)) - - text += '
' - - text += '


' + text += row_data.text + " " ret = {'text': text, 'ppn': ppn} return jsonify(ret) -@app.route('/digisam-ner//') -def digisam_ner(model_id, ppn): - - df = digisam.get(ppn) +@app.route('/tokenized', methods=['GET', 'POST']) +def tokenized(): - if len(df) == 0: - return 'bad request!', 400 - - text = '' - - for row_index, row_data in df.iterrows(): + raw_text = request.json['text'] - if row_data.text is None: - continue + sentences = tokenizer.parse_text(raw_text) - sentences = tokenizer.parse_text(row_data.text) + result = [(sen, i) for i, (sen, _) in enumerate(sentences)] - prediction = predictor_store.get(model_id).classify_text(sentences) + return jsonify(result) - for tokens, word_predictions in prediction: - last_prediction = 'O' +@app.route('/ner-bert-tokens/', methods=['GET', 'POST']) +def ner_bert_tokens(model_id): - for token, word_pred in zip(tokens, word_predictions): + raw_text = request.json['text'] - if token == '[UNK]': - continue + sentences = tokenizer.parse_text(raw_text) - if not token.startswith('##'): - text += ' ' + prediction = predictor_store.get(model_id).classify_text(sentences) - token = token[2:] if token.startswith('##') else token + output = [] - if word_pred != 'X': - last_prediction = word_pred + for tokens, word_predictions in prediction: - if last_prediction == 'O': - text += html.escape(token) - elif last_prediction.endswith('PER'): - text += '' + html.escape(token) + '' - elif last_prediction.endswith('LOC'): - text += '' + html.escape(token) + '' - elif last_prediction.endswith('ORG'): - text += '' + html.escape(token) + '' + output_sentence = [] - text += '
' + for token, word_pred in zip(tokens, word_predictions): - text += '


' + output_sentence.append({'token': html.escape(token), 'prediction': word_pred}) - ret = {'text': text, 'ppn': ppn} + output.append(output_sentence) - return jsonify(ret) + return jsonify(output) @app.route('/ner/', methods=['GET', 'POST']) diff --git a/qurator/sbb_ner/webapp/static/index.html b/qurator/sbb_ner/webapp/static/index.html index 227d06c..672f69b 100644 --- a/qurator/sbb_ner/webapp/static/index.html +++ b/qurator/sbb_ner/webapp/static/index.html @@ -35,9 +35,9 @@
@@ -48,8 +48,7 @@
- - +
@@ -75,5 +74,6 @@
+ \ No newline at end of file diff --git a/qurator/sbb_ner/webapp/static/js/ner-demo.js b/qurator/sbb_ner/webapp/static/js/ner-demo.js new file mode 100644 index 0000000..a08c391 --- /dev/null +++ b/qurator/sbb_ner/webapp/static/js/ner-demo.js @@ -0,0 +1,34 @@ +$(document).ready(function(){ + + $('#nerform').submit( + function(e){ + e.preventDefault(); + + var task = $('#task').val(); + var model_id = $('#model').val(); + var input_text = $('#inputtext').val() + + do_task(task, model_id, input_text); + } + ); + + $.get( "/models") + .done( + function( data ) { + var tmp=""; + $.each(data, + function(index, item){ + + selected="" + if (item.default) { + selected = "selected" + } + + tmp += '' + }); + $('#model').html(tmp); + } + ); + + task_select() +}); diff --git a/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js b/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js index 2dee0dc..6287685 100644 --- a/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js +++ b/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js @@ -4,7 +4,31 @@ $(document).ready(function(){ $('#nerform').submit( function(e){ e.preventDefault(); - load_ppn(); + + var task = $('#task').val(); + var model_id = $('#model').val(); + + var spinner_html = + `
+
+ Loading... +
+
`; + + var ppn = $('#ppn').val() + + $("#resultregion").html(spinner_html); + + $.get( "/digisam-fulltext/" + ppn) + .done(function( data ) { + + do_task(task, model_id, data.text) + }) + .fail( + function() { + console.log('Failed.'); + $("#resultregion").html('Failed.'); + }); } ); @@ -41,115 +65,3 @@ $(document).ready(function(){ task_select() }); - -function task_select() { - - var task = $('#task').val(); - - if (task < 3) { - $('#model_select').hide() - } - else { - $('#model_select').show() - } - - $("#resultregion").html(""); - $("#legende").html(""); -} - - -function load_ppn() { - - var ppn = $('#ppn').val() - - var text_region_html = - `
-
- Ergebnis: -
-
-
-
-
`; - - var legende_html = - `
-
- Legende: -
[Person]
-
[Ort]
-
[Organisation]
-
[keine Named Entity]
-
-
`; - - var spinner_html = - `
-
- Loading... -
-
`; - - $("#legende").html(""); - - var task = $('#task').val(); - var model_id = $('#model').val(); - - console.log("Task: " + task); - - if (task == 1) { - $("#resultregion").html(spinner_html); - - $.get( "/digisam-fulltext/" + ppn) - .done(function( data ) { - $("#resultregion").html(text_region_html) - $("#textregion").html(data.text) - }) - .fail( - function() { - console.log('Failed.'); - $("#resultregion").html('Failed.'); - }); - } - else if (task == 2) { - $("#resultregion").html(spinner_html); - - $.get( "/digisam-tokenized/" + ppn, - function( data ) { - $("#resultregion").html(text_region_html) - $("#textregion").html(data.text) - }).fail( - function() { - console.log('Failed.') - $("#resultregion").html('Failed.') - }); - } - else if (task == 3) { - - $("#resultregion").html(spinner_html); - - $.get( "/digisam-ner/" + model_id + "/" + ppn, - function( data ) { - $("#resultregion").html(text_region_html) - $("#textregion").html(data.text) - $("#legende").html(legende_html) - }).fail( - function(a,b,c) { - console.log('Failed.') - $("#resultregion").html('Failed.') - }); - } - else if (task == 4) { - $("#resultregion").html(spinner_html); - - $.get( "/digisam-ner-bert-tokens/" + model_id + "/" + ppn, - function( data ) { - $("#resultregion").html(text_region_html) - $("#textregion").html(data.text) - }).fail( - function(a,b,c) { - console.log('Failed.') - $("#resultregion").html('Failed.') - }); - } -} \ No newline at end of file diff --git a/qurator/sbb_ner/webapp/static/js/ner.js b/qurator/sbb_ner/webapp/static/js/ner.js index ed92b69..fa71a16 100644 --- a/qurator/sbb_ner/webapp/static/js/ner.js +++ b/qurator/sbb_ner/webapp/static/js/ner.js @@ -1,39 +1,9 @@ -$(document).ready(function(){ - - $('#nerform').submit( - function(e){ - e.preventDefault(); - do_task(); - } - ); - - $.get( "/models") - .done( - function( data ) { - var tmp=""; - $.each(data, - function(index, item){ - - selected="" - if (item.default) { - selected = "selected" - } - - tmp += '' - }); - $('#model').html(tmp); - } - ); - - task_select() -}); - function task_select() { var task = $('#task').val(); - if (task < 3) { + if ((task != "ner") && (task != "bert-tokens")){ $('#model_select').hide() } else { @@ -44,10 +14,9 @@ function task_select() { $("#legende").html(""); } +function do_task(task, model_id, input_text) { -function do_task() { - - var input_text = $('#inputtext').val() + var post_data = { "text" : input_text } var text_region_html = `
@@ -55,7 +24,7 @@ function do_task() { Ergebnis:
-
+
`; @@ -79,31 +48,45 @@ function do_task() { $("#legende").html(""); - var task = $('#task').val(); - var model_id = $('#model').val(); - -// if (task == 2) { -// $("#resultregion").html(spinner_html); -// -// $.get( "/digisam-tokenized/" + ppn, -// function( data ) { -// $("#resultregion").html(text_region_html) -// $("#textregion").html(data.text) -// }).fail( -// function() { -// console.log('Failed.') -// $("#resultregion").html('Failed.') -// }); -// } -// else -// - if (task == 3) { + if (task == "fulltext") { + $("#resultregion").html(text_region_html) + $("#textregion").html(input_text) + } + else if (task == "tokenize") { - $("#resultregion").html(spinner_html); + $("#resultregion").html(spinner_html) - post_data = { "text" : input_text } + $.ajax( + { + url: "/tokenized", + data: JSON.stringify(post_data), + type: 'POST', + contentType: "application/json", + success: + function( data ) { + text_html = "" + data.forEach( + function(sentence) { - console.log(post_data) + text_html += JSON.stringify(sentence) + + text_html += '
' + } + ) + $("#resultregion").html(text_region_html) + $("#textregion").html(text_html) + $("#legende").html(legende_html) + } + , + error: + function(error) { + console.log(error); + } + }) + } + else if (task == "ner") { + + $("#resultregion").html(spinner_html) $.ajax({ url: "/ner/" + model_id, @@ -141,43 +124,40 @@ function do_task() { console.log(error); } }); + } + else if (task == "bert-tokens") { + $("#resultregion").html(spinner_html); + $.ajax( + { + url: "/ner-bert-tokens/" + model_id, + data: JSON.stringify(post_data), + type: 'POST', + contentType: "application/json", + success: + function( data ) { + text_html = "" + data.forEach( + function(sentence) { + sentence.forEach( + function(part) { -// $.post( "/ner/" + model_id, post_data).done( -// function( data ) { -// -// text_region_html = "" -// data.forEach( -// function(sentence) { -// sentence.forEach( -// function(token) { -// text_region_html += token.word + "(" + token.prediction + ") " -// }) -// } -// ) -// -// $("#resultregion").html(text_region_html) -// $("#textregion").html(data.text) -// $("#legende").html(legende_html) -// }).fail( -// function(a,b,c) { -// console.log('Failed.') -// $("#resultregion").html('Failed.') -// }); + if (text_html != "") text_html += ' ' + + text_html += part.token + "(" + part.prediction + ")" + }) + text_html += '
' + } + ) + $("#resultregion").html(text_region_html) + $("#textregion").html(text_html) + $("#legende").html(legende_html) + } + , + error: + function(error) { + console.log(error); + } + }) } -// else -// -// if (task == 4) { -// $("#resultregion").html(spinner_html); -// -// $.get( "/digisam-ner-bert-tokens/" + model_id + "/" + ppn, -// function( data ) { -// $("#resultregion").html(text_region_html) -// $("#textregion").html(data.text) -// }).fail( -// function(a,b,c) { -// console.log('Failed.') -// $("#resultregion").html('Failed.') -// }); -// } } \ No newline at end of file diff --git a/qurator/sbb_ner/webapp/static/ner-ds-sbb.html b/qurator/sbb_ner/webapp/static/ner-ds-sbb.html index aea93d9..f1a533d 100644 --- a/qurator/sbb_ner/webapp/static/ner-ds-sbb.html +++ b/qurator/sbb_ner/webapp/static/ner-ds-sbb.html @@ -35,10 +35,10 @@
@@ -72,6 +72,7 @@
+ \ No newline at end of file