Mirror of https://github.com/qurator-spk/sbb_ner.git, synced 2025-06-09 12:20:00 +02:00
fix BERT Tokenizer

This commit is contained in:
parent 67f64300f7
commit 6c76ce13f1

2 changed files with 6 additions and 3 deletions
@@ -408,6 +408,9 @@ def convert_examples_to_features(example, label_map, max_seq_len, tokenizer):
                 tokens[start_pos + window_len].startswith('##'):
             window_len -= 1
 
+        if window_len == 1:
+            window_len = min(max_seq_len - 2, len(tokens) - start_pos)
+
         token_window = tokens[start_pos:start_pos+window_len]
         start_pos += window_len
 
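The added guard handles a degenerate case in the sliding-window logic: the inner loop walks the window end backwards so a sequence is never cut inside a word (WordPiece continuation pieces start with '##'), but a word longer than the whole window shrinks window_len all the way down to 1, after which every emitted window holds a single token. Resetting window_len forces a hard split inside the word so the loop keeps making normal progress. Below is a minimal, self-contained sketch of that logic; the helper name split_into_windows, the outer loop, and the initial window_len assignment are assumptions reconstructed from the visible context lines, not the repository's exact code.

def split_into_windows(tokens, max_seq_len):
    """Split WordPiece tokens into windows of at most max_seq_len - 2
    tokens (leaving room for [CLS] and [SEP]), preferring not to cut
    inside a word. Illustrative sketch, not the repo's code."""
    windows = []
    start_pos = 0
    while start_pos < len(tokens):
        window_len = min(max_seq_len - 2, len(tokens) - start_pos)

        # Shrink the window so it does not end right before a '##'
        # continuation piece, i.e. never split inside a word.
        while window_len > 1 and start_pos + window_len < len(tokens) and \
                tokens[start_pos + window_len].startswith('##'):
            window_len -= 1

        # The fix: if shrinking degenerated to a single token (the next
        # word is longer than the whole window), fall back to a hard
        # split inside the word so windows stay full-sized.
        if window_len == 1:
            window_len = min(max_seq_len - 2, len(tokens) - start_pos)

        windows.append(tokens[start_pos:start_pos + window_len])
        start_pos += window_len
    return windows

# One word of 10 pieces cannot fit into an 8-token window; without the
# guard, every emitted window would contain exactly one piece.
pieces = ['Donau'] + ['##dampf'] * 9
print(split_into_windows(pieces, max_seq_len=10))  # 8 pieces, then 2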
@@ -344,9 +344,9 @@ class WordpieceTokenizer(object):
         output_tokens = []
         for token in whitespace_tokenize(text):
             chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
+            # if len(chars) > self.max_input_chars_per_word:
+            #     output_tokens.append(self.unk_token)
+            #     continue
 
             # is_bad = False
             start = 0
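The second hunk comments out the tokenizer's early-out for over-long words, so words exceeding max_input_chars_per_word are still split greedily into pieces rather than replaced wholesale with the [UNK] token, presumably useful for noisy input where long run-together strings occur. A minimal sketch of the standard greedy longest-match-first WordPiece loop with that early-out disabled, as in the diff; the standalone function and the toy vocabulary are illustrative assumptions, not the repository's code.

def wordpiece(token, vocab, unk_token='[UNK]', max_input_chars_per_word=100):
    """Greedy longest-match-first WordPiece split of a single token.
    Sketch of the standard BERT algorithm; the length early-out that
    this commit disables is shown commented out, as in the diff."""
    chars = list(token)
    # Early-out disabled by the commit: long words are now split into
    # pieces instead of becoming a single unk_token.
    # if len(chars) > max_input_chars_per_word:
    #     return [unk_token]
    sub_tokens = []
    start = 0
    while start < len(chars):
        # Find the longest vocabulary entry matching at position `start`;
        # non-initial pieces carry the '##' continuation prefix.
        end = len(chars)
        cur_substr = None
        while start < end:
            substr = ''.join(chars[start:end])
            if start > 0:
                substr = '##' + substr
            if substr in vocab:
                cur_substr = substr
                break
            end -= 1
        if cur_substr is None:
            return [unk_token]  # no piece matches at all
        sub_tokens.append(cur_substr)
        start = end
    return sub_tokens

vocab = {'un', '##aff', '##able'}
print(wordpiece('unaffable', vocab))  # ['un', '##aff', '##able']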