x54-729 committed on
Commit
35f91cd
·
1 Parent(s): 454e418

Fix missing whitespace when using stream_chat with the fast tokenizer

Browse files
configuration_internlm2.py CHANGED
@@ -148,4 +148,4 @@ class InternLM2Config(PretrainedConfig):
148
  f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
149
  )
150
  if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
151
- raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
 
148
  f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
149
  )
150
  if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
151
+ raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
tokenization_internlm2.py CHANGED
@@ -233,4 +233,4 @@ class InternLM2Tokenizer(PreTrainedTokenizer):
233
 
234
  if token_ids_1 is None:
235
  return len(token_ids_0 + eos) * [0]
236
- return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
 
233
 
234
  if token_ids_1 is None:
235
  return len(token_ids_0 + eos) * [0]
236
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
tokenization_internlm2_fast.py CHANGED
@@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter):
56
  return unk_id
57
 
58
  def decoder(self, replacement, add_prefix_space):
59
- return decoders.Sequence(
60
- [
61
- decoders.Replace("▁", " "),
62
- decoders.ByteFallback(),
63
- decoders.Fuse(),
64
- decoders.Strip(content=" ", left=1),
65
- ]
66
- )
67
 
68
  def tokenizer(self, proto):
69
  model_type = proto.trainer_spec.model_type
@@ -211,4 +211,4 @@ class InternLM2TokenizerFast(PreTrainedTokenizerFast):
211
  if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
212
  copyfile(self.vocab_file, out_vocab_file)
213
 
214
- return (out_vocab_file,)
 
56
  return unk_id
57
 
58
  def decoder(self, replacement, add_prefix_space):
59
+ decoders_sequence = [
60
+ decoders.Replace("▁", " "),
61
+ decoders.ByteFallback(),
62
+ decoders.Fuse(),
63
+ ]
64
+ if self.proto.normalizer_spec.add_dummy_prefix:
65
+ decoders_sequence.append(decoders.Strip(content=" ", left=1))
66
+ return decoders.Sequence(decoders_sequence)
67
 
68
  def tokenizer(self, proto):
69
  model_type = proto.trainer_spec.model_type
 
211
  if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
212
  copyfile(self.vocab_file, out_vocab_file)
213
 
214
+ return (out_vocab_file,)