diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..623642d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +data/* +*.egg_info +venv +models +*.tar.gz diff --git a/Dockerfile b/Dockerfile index 668fb9d..07e47a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,4 +17,4 @@ COPY . /usr/src/qurator-mono-repo RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo WORKDIR /usr/src/qurator-mono-repo -CMD export LANG=C.UTF-8; env FLASK_APP=qurator/qurator_sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0 +CMD export LANG=C.UTF-8; env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0 diff --git a/Dockerfile.cpu b/Dockerfile.cpu index cd20388..83061c1 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -11,9 +11,12 @@ RUN apt-get update && \ COPY requirements.txt /tmp RUN pip3 --no-cache-dir install -r /tmp/requirements.txt -COPY . /usr/src/qurator-mono-repo +COPY . /usr/src/qurator-sbb-ner -RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo +RUN mkdir -p /usr/src/qurator-sbb-ner/konvens2019 +RUN mkdir -p /usr/src/qurator-sbb-ner/digisam -WORKDIR /usr/src/qurator-mono-repo -CMD env FLASK_APP=qurator/qurator_sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0 +RUN pip3 --no-cache-dir install -e /usr/src/qurator-sbb-ner + +WORKDIR /usr/src/qurator-sbb-ner +CMD env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0 diff --git a/Makefile b/Makefile index dd89591..a362fd7 100644 --- a/Makefile +++ b/Makefile @@ -510,3 +510,7 @@ wikipedia-evaluation: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-LFT.pkl wikipedia-evaluation2: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-SBB.pkl wikipedia-evaluation3: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-DE-CONLL-TESTA.pkl +############################### + +model_archive: + tar --exclude='*ep[1-6]*' --exclude='*eval*' --exclude='pytorch_model.bin' --exclude='*.pkl' -chzf models.tar.gz data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned data/konvens2019/build-wd_0.03/bert-all-german-baseline diff --git a/qurator/sbb_ner/webapp/app.py b/qurator/sbb_ner/webapp/app.py index b215f27..3629b2f 100644 --- a/qurator/sbb_ner/webapp/app.py +++ b/qurator/sbb_ner/webapp/app.py @@ -317,12 +317,11 @@ def ner(model_id): output = [] - word = None - last_prediction = 'O' - for tokens, word_predictions in prediction: + word = None last_prediction = 'O' + output_sentence = [] for token, word_pred in zip(tokens, word_predictions): @@ -331,7 +330,7 @@ def ner(model_id): if not token.startswith('##'): if word is not None: - output.append({'word': word, 'prediction': last_prediction}) + output_sentence.append({'word': word, 'prediction': last_prediction}) word = '' @@ -342,8 +341,10 @@ def ner(model_id): if word_pred != 'X': last_prediction = word_pred - if word is not None and len(word) > 0: - output.append({'word': word, 'prediction': last_prediction}) + if word is not None and len(word) > 0: + output_sentence.append({'word': word, 'prediction': last_prediction}) + + output.append(output_sentence) return jsonify(output) diff --git a/qurator/sbb_ner/webapp/static/index.html b/qurator/sbb_ner/webapp/static/index.html index e2a7d89..227d06c 100644 --- a/qurator/sbb_ner/webapp/static/index.html +++ b/qurator/sbb_ner/webapp/static/index.html @@ -9,7 +9,7 @@ - NER auf den digitalen Sammlungen + NER - Demo @@ -22,7 +22,7 @@
-

NER auf den digitalen Sammlungen

+

NER - Demo

@@ -35,7 +35,6 @@
- - + + + +
+ +
+
diff --git a/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js b/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js new file mode 100644 index 0000000..2dee0dc --- /dev/null +++ b/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js @@ -0,0 +1,155 @@ + +$(document).ready(function(){ + + $('#nerform').submit( + function(e){ + e.preventDefault(); + load_ppn(); + } + ); + + $.get( "/models") + .done( + function( data ) { + var tmp=""; + $.each(data, + function(index, item){ + + selected="" + if (item.default) { + selected = "selected" + } + + tmp += '' + }); + $('#model').html(tmp); + } + ); + + $.get( "/ppnexamples") + .done( + function( data ) { + var tmp=""; + $.each(data, + function(index, item){ + + tmp += '' + }); + $('#ppnexamples').html(tmp); + } + ); + + task_select() +}); + +function task_select() { + + var task = $('#task').val(); + + if (task < 3) { + $('#model_select').hide() + } + else { + $('#model_select').show() + } + + $("#resultregion").html(""); + $("#legende").html(""); +} + + +function load_ppn() { + + var ppn = $('#ppn').val() + + var text_region_html = + `
+
+ Ergebnis: +
+
+
+
+
`; + + var legende_html = + `
+
+ Legende: +
[Person]
+
[Ort]
+
[Organisation]
+
[keine Named Entity]
+
+
`; + + var spinner_html = + `
+
+ Loading... +
+
`; + + $("#legende").html(""); + + var task = $('#task').val(); + var model_id = $('#model').val(); + + console.log("Task: " + task); + + if (task == 1) { + $("#resultregion").html(spinner_html); + + $.get( "/digisam-fulltext/" + ppn) + .done(function( data ) { + $("#resultregion").html(text_region_html) + $("#textregion").html(data.text) + }) + .fail( + function() { + console.log('Failed.'); + $("#resultregion").html('Failed.'); + }); + } + else if (task == 2) { + $("#resultregion").html(spinner_html); + + $.get( "/digisam-tokenized/" + ppn, + function( data ) { + $("#resultregion").html(text_region_html) + $("#textregion").html(data.text) + }).fail( + function() { + console.log('Failed.') + $("#resultregion").html('Failed.') + }); + } + else if (task == 3) { + + $("#resultregion").html(spinner_html); + + $.get( "/digisam-ner/" + model_id + "/" + ppn, + function( data ) { + $("#resultregion").html(text_region_html) + $("#textregion").html(data.text) + $("#legende").html(legende_html) + }).fail( + function(a,b,c) { + console.log('Failed.') + $("#resultregion").html('Failed.') + }); + } + else if (task == 4) { + $("#resultregion").html(spinner_html); + + $.get( "/digisam-ner-bert-tokens/" + model_id + "/" + ppn, + function( data ) { + $("#resultregion").html(text_region_html) + $("#textregion").html(data.text) + }).fail( + function(a,b,c) { + console.log('Failed.') + $("#resultregion").html('Failed.') + }); + } +} \ No newline at end of file diff --git a/qurator/sbb_ner/webapp/static/js/ner.js b/qurator/sbb_ner/webapp/static/js/ner.js index 2dee0dc..ed92b69 100644 --- a/qurator/sbb_ner/webapp/static/js/ner.js +++ b/qurator/sbb_ner/webapp/static/js/ner.js @@ -4,7 +4,7 @@ $(document).ready(function(){ $('#nerform').submit( function(e){ e.preventDefault(); - load_ppn(); + do_task(); } ); @@ -26,19 +26,6 @@ $(document).ready(function(){ } ); - $.get( "/ppnexamples") - .done( - function( data ) { - var tmp=""; - $.each(data, - function(index, item){ - - tmp += '' - }); - $('#ppnexamples').html(tmp); - } - ); - task_select() }); @@ -58,9 +45,9 @@ function task_select() { } -function load_ppn() { +function do_task() { - var ppn = $('#ppn').val() + var input_text = $('#inputtext').val() var text_region_html = `
@@ -95,61 +82,102 @@ function load_ppn() { var task = $('#task').val(); var model_id = $('#model').val(); - console.log("Task: " + task); +// if (task == 2) { +// $("#resultregion").html(spinner_html); +// +// $.get( "/digisam-tokenized/" + ppn, +// function( data ) { +// $("#resultregion").html(text_region_html) +// $("#textregion").html(data.text) +// }).fail( +// function() { +// console.log('Failed.') +// $("#resultregion").html('Failed.') +// }); +// } +// else +// + if (task == 3) { - if (task == 1) { - $("#resultregion").html(spinner_html); - - $.get( "/digisam-fulltext/" + ppn) - .done(function( data ) { - $("#resultregion").html(text_region_html) - $("#textregion").html(data.text) - }) - .fail( - function() { - console.log('Failed.'); - $("#resultregion").html('Failed.'); - }); - } - else if (task == 2) { $("#resultregion").html(spinner_html); - $.get( "/digisam-tokenized/" + ppn, - function( data ) { - $("#resultregion").html(text_region_html) - $("#textregion").html(data.text) - }).fail( - function() { - console.log('Failed.') - $("#resultregion").html('Failed.') - }); - } - else if (task == 3) { - - $("#resultregion").html(spinner_html); - - $.get( "/digisam-ner/" + model_id + "/" + ppn, - function( data ) { - $("#resultregion").html(text_region_html) - $("#textregion").html(data.text) - $("#legende").html(legende_html) - }).fail( - function(a,b,c) { - console.log('Failed.') - $("#resultregion").html('Failed.') - }); - } - else if (task == 4) { - $("#resultregion").html(spinner_html); - - $.get( "/digisam-ner-bert-tokens/" + model_id + "/" + ppn, - function( data ) { - $("#resultregion").html(text_region_html) - $("#textregion").html(data.text) - }).fail( - function(a,b,c) { - console.log('Failed.') - $("#resultregion").html('Failed.') - }); + post_data = { "text" : input_text } + + console.log(post_data) + + $.ajax({ + url: "/ner/" + model_id, + data: JSON.stringify(post_data), + type: 'POST', + contentType: "application/json", + success: + function( data ) { + text_html = "" + data.forEach( + function(sentence) { + sentence.forEach( + function(token) { + + if (text_html != "") text_html += ' ' + + if (token.prediction == 'O') + text_html += token.word + else if (token.prediction.endsWith('PER')) + text_html += '' + token.word + '' + else if (token.prediction.endsWith('LOC')) + text_html += '' + token.word + '' + else if (token.prediction.endsWith('ORG')) + text_html += '' + token.word + '' + }) + text_html += '
' + } + ) + $("#resultregion").html(text_region_html) + $("#textregion").html(text_html) + $("#legende").html(legende_html) + } + , + error: function(error) { + console.log(error); + } + }); + + +// $.post( "/ner/" + model_id, post_data).done( +// function( data ) { +// +// text_region_html = "" +// data.forEach( +// function(sentence) { +// sentence.forEach( +// function(token) { +// text_region_html += token.word + "(" + token.prediction + ") " +// }) +// } +// ) +// +// $("#resultregion").html(text_region_html) +// $("#textregion").html(data.text) +// $("#legende").html(legende_html) +// }).fail( +// function(a,b,c) { +// console.log('Failed.') +// $("#resultregion").html('Failed.') +// }); } +// else +// +// if (task == 4) { +// $("#resultregion").html(spinner_html); +// +// $.get( "/digisam-ner-bert-tokens/" + model_id + "/" + ppn, +// function( data ) { +// $("#resultregion").html(text_region_html) +// $("#textregion").html(data.text) +// }).fail( +// function(a,b,c) { +// console.log('Failed.') +// $("#resultregion").html('Failed.') +// }); +// } } \ No newline at end of file diff --git a/qurator/sbb_ner/webapp/static/ner-ds-sbb.html b/qurator/sbb_ner/webapp/static/ner-ds-sbb.html new file mode 100644 index 0000000..aea93d9 --- /dev/null +++ b/qurator/sbb_ner/webapp/static/ner-ds-sbb.html @@ -0,0 +1,77 @@ + + + + + + + + + + + NER auf den digitalen Sammlungen + + + +
+ +
+ +
+
+
+
+
+

NER auf den digitalen Sammlungen

+
+
+
+
+
+
+
+
+
+
+ + +
+
+ + +
+ +
+ + + + + +
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 42e9ea1..89a5a21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ dask pyarrow tqdm pytorch-pretrained-bert==0.6.2 +sklearn click langid seqeval