1
0
Fork 0
mirror of https://github.com/qurator-spk/sbb_ner.git synced 2025-06-09 12:20:00 +02:00

fix BERT Tokenizer

This commit is contained in:
Kai Labusch 2019-11-22 18:06:29 +01:00
parent 67f64300f7
commit 6c76ce13f1
2 changed files with 6 additions and 3 deletions

View file

@@ -408,6 +408,9 @@ def convert_examples_to_features(example, label_map, max_seq_len, tokenizer):
tokens[start_pos + window_len].startswith('##'):
window_len -= 1
if window_len == 1:
window_len = min(max_seq_len - 2, len(tokens) - start_pos)
token_window = tokens[start_pos:start_pos+window_len]
start_pos += window_len

View file

@@ -344,9 +344,9 @@ class WordpieceTokenizer(object):
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
# if len(chars) > self.max_input_chars_per_word:
# output_tokens.append(self.unk_token)
# continue
# is_bad = False
start = 0