fix BERT Tokenizer

pull/2/head
Kai Labusch 5 years ago
parent 67f64300f7
commit 6c76ce13f1

@@ -408,6 +408,9 @@ def convert_examples_to_features(example, label_map, max_seq_len, tokenizer):
                 tokens[start_pos + window_len].startswith('##'):
             window_len -= 1
 
+        if window_len == 1:
+            window_len = min(max_seq_len - 2, len(tokens) - start_pos)
+
         token_window = tokens[start_pos:start_pos+window_len]
         start_pos += window_len
 
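Note on the hunk above: the inner loop backtracks window_len so a window does not end inside a word, but when every candidate boundary lands on a '##' continuation piece it shrinks window_len all the way down to 1, yielding a degenerate one-token window. The added guard restores the full window length in that case, accepting a mid-word split instead. Below is a minimal, self-contained sketch of the windowing logic as a standalone function with made-up tokens; the real code lives inside convert_examples_to_features and these names are illustrative only.

def split_into_windows(tokens, max_seq_len):
    windows = []
    start_pos = 0
    while start_pos < len(tokens):
        # Reserve 2 positions for the [CLS]/[SEP] special tokens.
        window_len = min(max_seq_len - 2, len(tokens) - start_pos)

        # Back off so the window does not end inside a word.
        while window_len > 1 and start_pos + window_len < len(tokens) and \
                tokens[start_pos + window_len].startswith('##'):
            window_len -= 1

        # The fix: if backtracking collapsed the window to a single token,
        # fall back to the full window and accept a mid-word split.
        if window_len == 1:
            window_len = min(max_seq_len - 2, len(tokens) - start_pos)

        windows.append(tokens[start_pos:start_pos + window_len])
        start_pos += window_len
    return windows

# A long word split into many '##' pieces previously degenerated into
# one-token windows; with the guard it is split across full windows.
print(split_into_windows(['un', '##believ', '##ab', '##ly'], max_seq_len=4))
# -> [['un', '##believ'], ['##ab', '##ly']]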

@@ -344,9 +344,9 @@ class WordpieceTokenizer(object):
         output_tokens = []
         for token in whitespace_tokenize(text):
             chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
+            # if len(chars) > self.max_input_chars_per_word:
+            #     output_tokens.append(self.unk_token)
+            #     continue
 
-            is_bad = False
+            # is_bad = False
             start = 0
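Note on this hunk: the stock WordpieceTokenizer replaces any token longer than max_input_chars_per_word with the unknown token before wordpiece matching even starts; commenting the check out lets arbitrarily long tokens go through the usual greedy longest-match-first splitting instead. Below is a minimal sketch of that greedy matching with a toy vocabulary, not the repository's code or vocab; the full implementation tracks failures via is_bad, which this sketch folds into an early return.

def wordpiece(token, vocab, unk_token='[UNK]'):
    chars = list(token)
    sub_tokens = []
    start = 0
    while start < len(chars):
        # Try the longest remaining substring first, shrinking the end
        # until a vocabulary entry matches.
        end = len(chars)
        cur_substr = None
        while start < end:
            substr = ''.join(chars[start:end])
            if start > 0:
                substr = '##' + substr  # continuation pieces are prefixed
            if substr in vocab:
                cur_substr = substr
                break
            end -= 1
        if cur_substr is None:
            return [unk_token]  # no piece matched at this position
        sub_tokens.append(cur_substr)
        start = end
    return sub_tokens

vocab = {'un', '##aff', '##able'}
print(wordpiece('unaffable', vocab))  # -> ['un', '##aff', '##able']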
