From a4b51ca7f4e1b093816c8d814bdcdd3349875bab Mon Sep 17 00:00:00 2001
From: Kai Labusch
Date: Thu, 7 Nov 2024 18:30:26 +0100
Subject: [PATCH] fix unicode problem

---
 .../sbb_ner/ground_truth/data_processor.py |  3 +
 qurator/sbb_ner/webapp/app.py              | 72 +++++++++++++------
 2 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/qurator/sbb_ner/ground_truth/data_processor.py b/qurator/sbb_ner/ground_truth/data_processor.py
index 577448b..85245f8 100644
--- a/qurator/sbb_ner/ground_truth/data_processor.py
+++ b/qurator/sbb_ner/ground_truth/data_processor.py
@@ -388,6 +388,9 @@ def convert_examples_to_features(example, label_map, max_seq_len, tokenizer):
     for i, word in enumerate(example.text_a):  # example.text_a is a sequence of words
 
         token = tokenizer.tokenize(word)
+
+        # import ipdb;ipdb.set_trace()
+
         tokens.extend(token)
 
         label_1 = example.label[i] if i < len(example.label) else 'O'
diff --git a/qurator/sbb_ner/webapp/app.py b/qurator/sbb_ner/webapp/app.py
index b1d566f..cbeb25b 100644
--- a/qurator/sbb_ner/webapp/app.py
+++ b/qurator/sbb_ner/webapp/app.py
@@ -219,57 +219,88 @@ def ner(model_id=None):
 
     output = []
 
-    for (tokens, word_predictions), (input_sentence, _) in zip(prediction, sentences):
+    for (tokens, token_predictions), (input_sentence, _) in zip(prediction, sentences):
+
+        output_text = ""
 
         original_text = "".join(input_sentence)
         original_word_positions = \
             [pos for positions in [[idx] * len(word) for idx, word in enumerate(input_sentence)] for pos in positions]
 
         word = ''
-        last_prediction = 'O'
+        word_prediction = 'O'
         output_sentence = []
 
-        for pos, (token, word_pred) in enumerate(zip(tokens, word_predictions)):
+        for pos, (token, token_prediction) in enumerate(zip(tokens, token_predictions)):
 
-            if word_pred == '[SEP]':
-                word_pred = 'O'
+            if not token.startswith('##') and token_prediction == 'X' or token_prediction == '[SEP]':
+                token_prediction = 'O'
 
-            if not token.startswith('##') and token != '[UNK]':
-                if len(word) > 0:
-                    output_sentence.append({'word': word, 'prediction': last_prediction})
+            orig_pos = len(output_text + word)
 
+            # if the current word length is greater than 0
+            # and it's either a word start token (does not start with ##) and not an unknown token or the original text
+            # positions indicate a word break
+            if len(word) > 0 and ((not token.startswith('##') and token != '[UNK]') or
+                                  (orig_pos > 0 and
+                                   original_word_positions[orig_pos-1] != original_word_positions[orig_pos])):
+                output_sentence.append({'word': word, 'prediction': word_prediction})
+                output_text += word
                 word = ''
+                word_prediction = 'O'
 
             if token == '[UNK]':
-                orig_pos = len("".join([pred['word'] for pred in output_sentence]) + word)
 
+                orig_pos = len(output_text + word)
+
+                # are we on a word boundary?
                 if orig_pos > 0 and original_word_positions[orig_pos-1] != original_word_positions[orig_pos]:
-                    output_sentence.append({'word': word, 'prediction': last_prediction})
+
+                    # we are on a word boundary - start a new word ...
+                    output_sentence.append({'word': word, 'prediction': word_prediction})
+                    output_text += word
                     word = ''
+                    word_prediction = 'O'
 
-                word += original_text[orig_pos]
+                # get character that corresponds to [UNK] token from original text
+                token = original_text[orig_pos]
 
-                if word_pred != 'X':
-                    last_prediction = word_pred
+            else:
+                token = token[2:] if token.startswith('##') else token
 
-                continue
+            # if the output_text plus the current word and token is not a prefix of the original text, it means
+            # we would miss characters. Therefore we take the missing characters from the original text at the current
+            # word position
+            while not original_text.startswith(output_text + word + token) \
+                    and len(output_text + word) < len(original_text):
 
-            if not token.startswith('##') and word_pred == 'X':
-                word_pred = 'O'
+                word += original_text[len(output_text + word)]
 
-            token = token[2:] if token.startswith('##') else token
+                orig_pos = len(output_text + word)
+
+                # are we on a word boundary?
+                if orig_pos > 0 and original_word_positions[orig_pos - 1] != original_word_positions[orig_pos]:
+                    # we are on a word boundary - start a new word ...
+                    output_sentence.append({'word': word, 'prediction': word_prediction})
+                    output_text += word
+                    word = ''
+                    word_prediction = 'O'
 
             word += token
 
-            if word_pred != 'X':
-                last_prediction = word_pred
+            if token_prediction != 'X':
+                word_prediction = token_prediction
 
         if len(word) > 0:
-            output_sentence.append({'word': word, 'prediction': last_prediction})
+            output_text += word
+            output_sentence.append({'word': word, 'prediction': word_prediction})
 
         output.append(output_sentence)
 
+        try:
+            assert output_text == original_text
+        except AssertionError:
+            import ipdb;ipdb.set_trace()
+
    for output_sentence, (input_sentence, _) in zip(output, sentences):
 
         try:
@@ -278,6 +309,7 @@ def ner(model_id=None):
             logger.warning('Input and output different!!! \n\n\nInput: {}\n\nOutput: {}\n'.
                            format("".join(input_sentence).replace(" ", ""),
                                   "".join([pred['word'] for pred in output_sentence])))
 
+    torch.cuda.empty_cache()
 
     return jsonify(output)
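
Note: the following is a minimal standalone sketch, not part of the patch; the
function name stitch and the sample inputs are illustrative only. It isolates
the prefix-check idea the fix relies on: whenever the text reassembled from
BERT WordPiece tokens stops being a prefix of the original input, the missing
characters are copied over from the original text, so characters that the
tokenizer dropped (e.g. combining marks removed by unicode normalization) or
collapsed into [UNK] are never lost.

    def stitch(original_text, tokens):
        # Reassemble original_text from WordPiece-style tokens: '##' marks a
        # subword continuation, '[UNK]' stands in for an unknown character.
        # Assumes whitespace was removed before tokenization (as in app.py).
        output_text = ""
        for token in tokens:
            if token == '[UNK]':
                # recover the character behind [UNK] from the original text
                token = original_text[len(output_text)]
            elif token.startswith('##'):
                token = token[2:]
            # copy characters the tokenizer silently dropped until the
            # reassembled text is again a prefix of the original
            while (not original_text.startswith(output_text + token)
                   and len(output_text) < len(original_text)):
                output_text += original_text[len(output_text)]
            output_text += token
        return output_text

    # combining acute accent (U+0301) dropped by tokenizer normalization
    assert stitch("fu\u0301r", ["fu", "##r"]) == "fu\u0301r"
    # character the vocabulary maps to [UNK]
    assert stitch("a\u2603b", ["a", "[UNK]", "##b"]) == "a\u2603b"

The patch applies the same prefix check per word while additionally tracking
word boundaries via original_word_positions, which is why output_text must
equal original_text at the end of each sentence (the assert in the diff).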