diff --git a/qurator/sbb_ner/ground_truth/data_processor.py b/qurator/sbb_ner/ground_truth/data_processor.py index 054848f..577448b 100644 --- a/qurator/sbb_ner/ground_truth/data_processor.py +++ b/qurator/sbb_ner/ground_truth/data_processor.py @@ -408,6 +408,9 @@ def convert_examples_to_features(example, label_map, max_seq_len, tokenizer): tokens[start_pos + window_len].startswith('##'): window_len -= 1 + if window_len == 1: + window_len = min(max_seq_len - 2, len(tokens) - start_pos) + token_window = tokens[start_pos:start_pos+window_len] start_pos += window_len diff --git a/qurator/sbb_ner/models/tokenization.py b/qurator/sbb_ner/models/tokenization.py index 67fae26..d3ebf20 100644 --- a/qurator/sbb_ner/models/tokenization.py +++ b/qurator/sbb_ner/models/tokenization.py @@ -344,9 +344,9 @@ class WordpieceTokenizer(object): output_tokens = [] for token in whitespace_tokenize(text): chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue + # if len(chars) > self.max_input_chars_per_word: + # output_tokens.append(self.unk_token) + # continue # is_bad = False start = 0