x54-729
committed on
Commit
·
35f91cd
1
Parent(s):
454e418
fix no white space when using stream_chat with fast tokenizer
Browse files
configuration_internlm2.py
CHANGED
@@ -148,4 +148,4 @@ class InternLM2Config(PretrainedConfig):
|
|
148 |
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
|
149 |
)
|
150 |
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
|
151 |
-
raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
|
|
|
148 |
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
|
149 |
)
|
150 |
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
|
151 |
+
raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
|
tokenization_internlm2.py
CHANGED
@@ -233,4 +233,4 @@ class InternLM2Tokenizer(PreTrainedTokenizer):
|
|
233 |
|
234 |
if token_ids_1 is None:
|
235 |
return len(token_ids_0 + eos) * [0]
|
236 |
-
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
|
|
|
233 |
|
234 |
if token_ids_1 is None:
|
235 |
return len(token_ids_0 + eos) * [0]
|
236 |
+
return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
|
tokenization_internlm2_fast.py
CHANGED
@@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter):
|
|
56 |
return unk_id
|
57 |
|
58 |
def decoder(self, replacement, add_prefix_space):
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
)
|
67 |
|
68 |
def tokenizer(self, proto):
|
69 |
model_type = proto.trainer_spec.model_type
|
@@ -211,4 +211,4 @@ class InternLM2TokenizerFast(PreTrainedTokenizerFast):
|
|
211 |
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
|
212 |
copyfile(self.vocab_file, out_vocab_file)
|
213 |
|
214 |
-
return (out_vocab_file,)
|
|
|
56 |
return unk_id
|
57 |
|
58 |
def decoder(self, replacement, add_prefix_space):
|
59 |
+
decoders_sequence = [
|
60 |
+
decoders.Replace("▁", " "),
|
61 |
+
decoders.ByteFallback(),
|
62 |
+
decoders.Fuse(),
|
63 |
+
]
|
64 |
+
if self.proto.normalizer_spec.add_dummy_prefix:
|
65 |
+
decoders_sequence.append(decoders.Strip(content=" ", left=1))
|
66 |
+
return decoders.Sequence(decoders_sequence)
|
67 |
|
68 |
def tokenizer(self, proto):
|
69 |
model_type = proto.trainer_spec.model_type
|
|
|
211 |
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
|
212 |
copyfile(self.vocab_file, out_vocab_file)
|
213 |
|
214 |
+
return (out_vocab_file,)
|