fix BERT Tokenizer

pull/2/head
Kai Labusch 5 years ago
parent 67f64300f7
commit 6c76ce13f1

@@ -408,6 +408,9 @@ def convert_examples_to_features(example, label_map, max_seq_len, tokenizer):
                 tokens[start_pos + window_len].startswith('##'):
             window_len -= 1
 
+        if window_len == 1:
+            window_len = min(max_seq_len - 2, len(tokens) - start_pos)
+
         token_window = tokens[start_pos:start_pos+window_len]
         start_pos += window_len
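
The added fallback guards the sliding-window loop above: the window is shrunk so it does not end inside a word (wordpieces prefixed with '##'), but when a single word is longer than the whole window, the shrinking bottoms out at window_len == 1 and the loop degenerates into one-token windows. A minimal sketch of that windowing logic with the fix applied; all names except tokens, max_seq_len, window_len and start_pos are illustrative, not taken from the repository:

def token_windows(tokens, max_seq_len):
    """Yield windows of wordpiece tokens, trying not to end a window
    mid-word ('##' marks a word-continuation piece)."""
    start_pos = 0
    while start_pos < len(tokens):
        window_len = min(max_seq_len - 2, len(tokens) - start_pos)

        # Shrink the window so it does not stop inside a word.
        while window_len > 1 and start_pos + window_len < len(tokens) \
                and tokens[start_pos + window_len].startswith('##'):
            window_len -= 1

        # The committed fix: if shrinking bottomed out (the next word is
        # longer than the window), fall back to a full window and accept
        # a mid-word split instead of emitting one-token windows.
        if window_len == 1:
            window_len = min(max_seq_len - 2, len(tokens) - start_pos)

        yield tokens[start_pos:start_pos + window_len]
        start_pos += window_len

For example, with tokens = ['f', '##o', '##o', '##b', '##a', '##r'] and max_seq_len = 5, the unpatched loop emits six one-token windows; with the fallback it emits two three-token windows.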

@@ -344,9 +344,9 @@ class WordpieceTokenizer(object):
         output_tokens = []
         for token in whitespace_tokenize(text):
             chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
+            # if len(chars) > self.max_input_chars_per_word:
+            #     output_tokens.append(self.unk_token)
+            #     continue
             # is_bad = False
             start = 0
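
Commenting out the length check changes what WordpieceTokenizer.tokenize does with very long tokens: instead of mapping anything longer than max_input_chars_per_word (200 in Google's reference implementation) straight to the unknown token, such tokens now run through the usual greedy longest-match-first wordpiece split. A simplified stand-in sketching both behaviours; the function and its max_chars parameter are illustrative, not the repository's API:

def wordpiece(token, vocab, unk_token='[UNK]', max_chars=None):
    # Old behaviour: tokens over the length limit become [UNK] outright.
    if max_chars is not None and len(token) > max_chars:
        return [unk_token]
    # Greedy longest-match-first split, as in the reference BERT code.
    pieces, start = [], 0
    while start < len(token):
        end, cur = len(token), None
        while start < end:
            sub = token[start:end]
            if start > 0:
                sub = '##' + sub
            if sub in vocab:
                cur = sub
                break
            end -= 1
        if cur is None:
            return [unk_token]  # no matching wordpiece at this position
        pieces.append(cur)
        start = end
    return pieces

vocab = {'un', '##believ', '##able'}
wordpiece('unbelievable', vocab)               # ['un', '##believ', '##able']
wordpiece('unbelievable', vocab, max_chars=5)  # old path: ['[UNK]']

With the check disabled, no token is discarded as [UNK] purely because of its length.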
