mirror of
https://github.com/qurator-spk/sbb_ner.git
synced 2025-07-28 04:09:53 +02:00
fix unicode problem
This commit is contained in:
parent
d7d6d61280
commit
a4b51ca7f4
2 changed files with 55 additions and 20 deletions
|
@ -388,6 +388,9 @@ def convert_examples_to_features(example, label_map, max_seq_len, tokenizer):
|
||||||
for i, word in enumerate(example.text_a): # example.text_a is a sequence of words
|
for i, word in enumerate(example.text_a): # example.text_a is a sequence of words
|
||||||
|
|
||||||
token = tokenizer.tokenize(word)
|
token = tokenizer.tokenize(word)
|
||||||
|
|
||||||
|
# import ipdb;ipdb.set_trace()
|
||||||
|
|
||||||
tokens.extend(token)
|
tokens.extend(token)
|
||||||
|
|
||||||
label_1 = example.label[i] if i < len(example.label) else 'O'
|
label_1 = example.label[i] if i < len(example.label) else 'O'
|
||||||
|
|
|
@ -219,57 +219,88 @@ def ner(model_id=None):
|
||||||
|
|
||||||
output = []
|
output = []
|
||||||
|
|
||||||
for (tokens, word_predictions), (input_sentence, _) in zip(prediction, sentences):
|
for (tokens, token_predictions), (input_sentence, _) in zip(prediction, sentences):
|
||||||
|
|
||||||
|
output_text = ""
|
||||||
original_text = "".join(input_sentence)
|
original_text = "".join(input_sentence)
|
||||||
original_word_positions = \
|
original_word_positions = \
|
||||||
[pos for positions in [[idx] * len(word) for idx, word in enumerate(input_sentence)] for pos in positions]
|
[pos for positions in [[idx] * len(word) for idx, word in enumerate(input_sentence)] for pos in positions]
|
||||||
|
|
||||||
word = ''
|
word = ''
|
||||||
last_prediction = 'O'
|
word_prediction = 'O'
|
||||||
output_sentence = []
|
output_sentence = []
|
||||||
|
|
||||||
for pos, (token, word_pred) in enumerate(zip(tokens, word_predictions)):
|
for pos, (token, token_prediction) in enumerate(zip(tokens, token_predictions)):
|
||||||
|
|
||||||
if word_pred == '[SEP]':
|
if not token.startswith('##') and token_prediction == 'X' or token_prediction == '[SEP]':
|
||||||
word_pred = 'O'
|
token_prediction = 'O'
|
||||||
|
|
||||||
if not token.startswith('##') and token != '[UNK]':
|
orig_pos = len(output_text + word)
|
||||||
if len(word) > 0:
|
|
||||||
output_sentence.append({'word': word, 'prediction': last_prediction})
|
|
||||||
|
|
||||||
|
# if the current word length is greater than 0
|
||||||
|
# and either it is a word start token (does not start with ##) that is not an unknown token, or the original text
|
||||||
|
# positions indicate a word break
|
||||||
|
if len(word) > 0 and ((not token.startswith('##') and token != '[UNK]') or
|
||||||
|
(orig_pos > 0 and
|
||||||
|
original_word_positions[orig_pos-1] != original_word_positions[orig_pos])):
|
||||||
|
output_sentence.append({'word': word, 'prediction': word_prediction})
|
||||||
|
output_text += word
|
||||||
word = ''
|
word = ''
|
||||||
|
word_prediction = 'O'
|
||||||
|
|
||||||
if token == '[UNK]':
|
if token == '[UNK]':
|
||||||
|
|
||||||
orig_pos = len("".join([pred['word'] for pred in output_sentence]) + word)
|
orig_pos = len(output_text + word)
|
||||||
|
|
||||||
|
# are we on a word boundary?
|
||||||
if orig_pos > 0 and original_word_positions[orig_pos-1] != original_word_positions[orig_pos]:
|
if orig_pos > 0 and original_word_positions[orig_pos-1] != original_word_positions[orig_pos]:
|
||||||
output_sentence.append({'word': word, 'prediction': last_prediction})
|
|
||||||
|
# we are on a word boundary - start a new word ...
|
||||||
|
output_sentence.append({'word': word, 'prediction': word_prediction})
|
||||||
|
output_text += word
|
||||||
word = ''
|
word = ''
|
||||||
|
word_prediction = 'O'
|
||||||
|
|
||||||
word += original_text[orig_pos]
|
# get character that corresponds to [UNK] token from original text
|
||||||
|
token = original_text[orig_pos]
|
||||||
|
|
||||||
if word_pred != 'X':
|
else:
|
||||||
last_prediction = word_pred
|
token = token[2:] if token.startswith('##') else token
|
||||||
|
|
||||||
continue
|
# if output_text plus the current word and token is not a prefix of the original text,
|
||||||
|
# we would miss characters. Therefore we take the missing characters from the original text at the current
|
||||||
|
# word position
|
||||||
|
while not original_text.startswith(output_text + word + token) \
|
||||||
|
and len(output_text + word) < len(original_text):
|
||||||
|
|
||||||
if not token.startswith('##') and word_pred == 'X':
|
word += original_text[len(output_text + word)]
|
||||||
word_pred = 'O'
|
|
||||||
|
|
||||||
token = token[2:] if token.startswith('##') else token
|
orig_pos = len(output_text + word)
|
||||||
|
|
||||||
|
# are we on a word boundary?
|
||||||
|
if orig_pos > 0 and original_word_positions[orig_pos - 1] != original_word_positions[orig_pos]:
|
||||||
|
# we are on a word boundary - start a new word ...
|
||||||
|
output_sentence.append({'word': word, 'prediction': word_prediction})
|
||||||
|
output_text += word
|
||||||
|
word = ''
|
||||||
|
word_prediction = 'O'
|
||||||
|
|
||||||
word += token
|
word += token
|
||||||
|
|
||||||
if word_pred != 'X':
|
if token_prediction != 'X':
|
||||||
last_prediction = word_pred
|
word_prediction = token_prediction
|
||||||
|
|
||||||
if len(word) > 0:
|
if len(word) > 0:
|
||||||
output_sentence.append({'word': word, 'prediction': last_prediction})
|
output_text += word
|
||||||
|
output_sentence.append({'word': word, 'prediction': word_prediction})
|
||||||
|
|
||||||
output.append(output_sentence)
|
output.append(output_sentence)
|
||||||
|
|
||||||
|
try:
|
||||||
|
assert output_text == original_text
|
||||||
|
except AssertionError:
|
||||||
|
import ipdb;ipdb.set_trace()
|
||||||
|
|
||||||
for output_sentence, (input_sentence, _) in zip(output, sentences):
|
for output_sentence, (input_sentence, _) in zip(output, sentences):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -278,6 +309,7 @@ def ner(model_id=None):
|
||||||
logger.warning('Input and output different!!! \n\n\nInput: {}\n\nOutput: {}\n'.
|
logger.warning('Input and output different!!! \n\n\nInput: {}\n\nOutput: {}\n'.
|
||||||
format("".join(input_sentence).replace(" ", ""),
|
format("".join(input_sentence).replace(" ", ""),
|
||||||
"".join([pred['word'] for pred in output_sentence])))
|
"".join([pred['word'] for pred in output_sentence])))
|
||||||
|
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
return jsonify(output)
|
return jsonify(output)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue