fix NER output; fix BERT Tokenizer

pull/2/head
Kai Labusch 5 years ago
parent 3eabe5054a
commit 775d0cd753

@@ -262,16 +262,17 @@ def ner(model_id):
     for pos, (token, word_pred) in enumerate(zip(tokens, word_predictions)):
-        if not token.startswith('##'):
+        if not token.startswith('##') and token != '[UNK]':
             if len(word) > 0:
                 output_sentence.append({'word': word, 'prediction': last_prediction})
             word = ''
         if token == '[UNK]':
-            orig_pos = len("".join([pred['word'] for pred in output_sentence]))
+            orig_pos = len("".join([pred['word'] for pred in output_sentence]) + word)
             word += original_text[orig_pos]
+            output_sentence.append({'word': original_text[orig_pos], 'prediction': last_prediction})
             continue
         token = token[2:] if token.startswith('##') else token
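
For readers new to this part of the code, below is a minimal, self-contained sketch of the word-reassembly idea the patch touches. The names tokens, word_predictions, original_text, output_sentence, word, and last_prediction follow the diff; everything else (the function wrapper, the whitespace-stripped original_text, the assumption that '[UNK]' stands for a single character the tokenizer could not encode, and the example at the end) is an illustration, not code taken from the repository.

def reassemble(tokens, word_predictions, original_text):
    """Glue BERT WordPiece tokens back into words and recover '[UNK]' characters.

    Assumes original_text is the sentence with all whitespace removed, so the
    running length of the emitted words indexes straight into it, and that each
    '[UNK]' stands for exactly one character the tokenizer could not encode.
    """
    output_sentence = []
    word = ''
    last_prediction = 'O'

    for token, word_pred in zip(tokens, word_predictions):

        # A token that is neither a '##' continuation nor '[UNK]' starts a new
        # word, so flush whatever has been collected so far.
        if not token.startswith('##') and token != '[UNK]':
            if len(word) > 0:
                output_sentence.append({'word': word, 'prediction': last_prediction})
            word = ''

        # '[UNK]' has no surface form; recover the lost character from the
        # original text at the position reached so far (emitted words plus the
        # still pending word) and keep it inside the pending word.
        if token == '[UNK]':
            orig_pos = len("".join(p['word'] for p in output_sentence) + word)
            word += original_text[orig_pos]
            continue

        # Strip the '##' continuation marker and extend the pending word.
        word += token[2:] if token.startswith('##') else token
        last_prediction = word_pred

    if len(word) > 0:
        output_sentence.append({'word': word, 'prediction': last_prediction})

    return output_sentence


# Hypothetical example: '☃' is not in the vocabulary, so WordPiece emits '[UNK]'.
print(reassemble(['An', '##na', '[UNK]', 'lacht'],
                 ['B-PER', 'B-PER', 'O', 'O'],
                 'Anna☃lacht'))
# -> [{'word': 'Anna☃', 'prediction': 'B-PER'}, {'word': 'lacht', 'prediction': 'O'}]

The point of the changed orig_pos line is visible here: counting the characters of the already emitted words plus the still pending word keeps the offset into original_text correct even when '[UNK]' arrives while a word is still being assembled.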
