1
0
Fork 0
mirror of https://github.com/qurator-spk/sbb_ner.git synced 2025-06-08 11:50:16 +02:00

fix NER output; fix BERT Tokenizer

This commit is contained in:
Kai Labusch 2019-11-22 17:10:21 +01:00
parent 3eabe5054a
commit 775d0cd753

View file

@ -262,16 +262,17 @@ def ner(model_id):
for pos, (token, word_pred) in enumerate(zip(tokens, word_predictions)): for pos, (token, word_pred) in enumerate(zip(tokens, word_predictions)):
if not token.startswith('##'): if not token.startswith('##') and token != '[UNK]':
if len(word) > 0: if len(word) > 0:
output_sentence.append({'word': word, 'prediction': last_prediction}) output_sentence.append({'word': word, 'prediction': last_prediction} )
word = '' word = ''
if token == '[UNK]': if token == '[UNK]':
orig_pos = len("".join([pred['word'] for pred in output_sentence])) orig_pos = len("".join([pred['word'] for pred in output_sentence]) + word)
word += original_text[orig_pos]
output_sentence.append({'word': original_text[orig_pos], 'prediction': last_prediction})
continue continue
token = token[2:] if token.startswith('##') else token token = token[2:] if token.startswith('##') else token