fix unicode problem

pull/5/merge
Kai Labusch 2 weeks ago
parent d7d6d61280
commit a4b51ca7f4

@ -388,6 +388,9 @@ def convert_examples_to_features(example, label_map, max_seq_len, tokenizer):
for i, word in enumerate(example.text_a): # example.text_a is a sequence of words for i, word in enumerate(example.text_a): # example.text_a is a sequence of words
token = tokenizer.tokenize(word) token = tokenizer.tokenize(word)
# import ipdb;ipdb.set_trace()
tokens.extend(token) tokens.extend(token)
label_1 = example.label[i] if i < len(example.label) else 'O' label_1 = example.label[i] if i < len(example.label) else 'O'

@ -219,57 +219,88 @@ def ner(model_id=None):
output = [] output = []
for (tokens, word_predictions), (input_sentence, _) in zip(prediction, sentences): for (tokens, token_predictions), (input_sentence, _) in zip(prediction, sentences):
output_text = ""
original_text = "".join(input_sentence) original_text = "".join(input_sentence)
original_word_positions = \ original_word_positions = \
[pos for positions in [[idx] * len(word) for idx, word in enumerate(input_sentence)] for pos in positions] [pos for positions in [[idx] * len(word) for idx, word in enumerate(input_sentence)] for pos in positions]
word = '' word = ''
last_prediction = 'O' word_prediction = 'O'
output_sentence = [] output_sentence = []
for pos, (token, word_pred) in enumerate(zip(tokens, word_predictions)): for pos, (token, token_prediction) in enumerate(zip(tokens, token_predictions)):
if word_pred == '[SEP]': if not token.startswith('##') and token_prediction == 'X' or token_prediction == '[SEP]':
word_pred = 'O' token_prediction = 'O'
if not token.startswith('##') and token != '[UNK]': orig_pos = len(output_text + word)
if len(word) > 0:
output_sentence.append({'word': word, 'prediction': last_prediction})
# if the current word is non-empty
# and either the token is a word-start token (does not start with ##) that is not an unknown token, or the original text
# positions indicate a word break, then the current word is complete
if len(word) > 0 and ((not token.startswith('##') and token != '[UNK]') or
(orig_pos > 0 and
original_word_positions[orig_pos-1] != original_word_positions[orig_pos])):
output_sentence.append({'word': word, 'prediction': word_prediction})
output_text += word
word = '' word = ''
word_prediction = 'O'
if token == '[UNK]': if token == '[UNK]':
orig_pos = len("".join([pred['word'] for pred in output_sentence]) + word) orig_pos = len(output_text + word)
# are we on a word boundary?
if orig_pos > 0 and original_word_positions[orig_pos-1] != original_word_positions[orig_pos]: if orig_pos > 0 and original_word_positions[orig_pos-1] != original_word_positions[orig_pos]:
output_sentence.append({'word': word, 'prediction': last_prediction})
# we are on a word boundary - start a new word ...
output_sentence.append({'word': word, 'prediction': word_prediction})
output_text += word
word = '' word = ''
word_prediction = 'O'
word += original_text[orig_pos] # get character that corresponds to [UNK] token from original text
token = original_text[orig_pos]
if word_pred != 'X': else:
last_prediction = word_pred token = token[2:] if token.startswith('##') else token
continue # if output_text plus the current word and token is not a prefix of the original text,
# we would skip characters. Therefore we take the missing characters, one at a time, from the original text
# at the current position
while not original_text.startswith(output_text + word + token) \
and len(output_text + word) < len(original_text):
if not token.startswith('##') and word_pred == 'X': word += original_text[len(output_text + word)]
word_pred = 'O'
token = token[2:] if token.startswith('##') else token orig_pos = len(output_text + word)
# are we on a word boundary?
if orig_pos > 0 and original_word_positions[orig_pos - 1] != original_word_positions[orig_pos]:
# we are on a word boundary - start a new word ...
output_sentence.append({'word': word, 'prediction': word_prediction})
output_text += word
word = ''
word_prediction = 'O'
word += token word += token
if word_pred != 'X': if token_prediction != 'X':
last_prediction = word_pred word_prediction = token_prediction
if len(word) > 0: if len(word) > 0:
output_sentence.append({'word': word, 'prediction': last_prediction}) output_text += word
output_sentence.append({'word': word, 'prediction': word_prediction})
output.append(output_sentence) output.append(output_sentence)
try:
assert output_text == original_text
except AssertionError:
import ipdb;ipdb.set_trace()
for output_sentence, (input_sentence, _) in zip(output, sentences): for output_sentence, (input_sentence, _) in zip(output, sentences):
try: try:
@ -278,6 +309,7 @@ def ner(model_id=None):
logger.warning('Input and output different!!! \n\n\nInput: {}\n\nOutput: {}\n'. logger.warning('Input and output different!!! \n\n\nInput: {}\n\nOutput: {}\n'.
format("".join(input_sentence).replace(" ", ""), format("".join(input_sentence).replace(" ", ""),
"".join([pred['word'] for pred in output_sentence]))) "".join([pred['word'] for pred in output_sentence])))
torch.cuda.empty_cache() torch.cuda.empty_cache()
return jsonify(output) return jsonify(output)

Loading…
Cancel
Save