BAAI /

ldwang committed · verified
Commit 0f7783a · 1 Parent(s): 3b004d5

Delete cooldown
cooldown/iter_0060000_hf/config.json DELETED
@@ -1,27 +0,0 @@
1
- {
2
- "architectures": [
3
- "MistralForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151849,
8
- "eos_token_id": 151850,
9
- "head_dim": 64,
10
- "hidden_act": "silu",
11
- "hidden_size": 576,
12
- "initializer_range": 0.02,
13
- "intermediate_size": 1536,
14
- "max_position_embeddings": 8192,
15
- "model_type": "mistral",
16
- "num_attention_heads": 9,
17
- "num_hidden_layers": 30,
18
- "num_key_value_heads": 3,
19
- "rms_norm_eps": 1e-05,
20
- "rope_theta": 10000,
21
- "sliding_window": 8192,
22
- "tie_word_embeddings": true,
23
- "torch_dtype": "bfloat16",
24
- "transformers_version": "4.44.2",
25
- "use_cache": true,
26
- "vocab_size": 151851
27
- }
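
The deleted config.json describes a small Mistral-architecture causal LM: 30 hidden layers, hidden size 576, 9 attention heads with 3 key/value heads, an 8192-token context window, and a 151,851-entry vocabulary with tied embeddings. For context, a checkpoint laid out this way loads through the standard transformers auto classes. The sketch below is illustrative only: it assumes a local copy of the directory this commit deletes, and the path is hypothetical.

    # Minimal sketch, assuming a local copy of the (now-deleted) checkpoint directory.
    import torch
    from transformers import AutoConfig, AutoModelForCausalLM

    ckpt_dir = "cooldown/iter_0060000_hf"  # hypothetical local path

    config = AutoConfig.from_pretrained(ckpt_dir)
    print(config.model_type, config.num_hidden_layers, config.hidden_size)  # mistral 30 576

    # bfloat16 mirrors the "torch_dtype" recorded in the deleted config.
    model = AutoModelForCausalLM.from_pretrained(ckpt_dir, torch_dtype=torch.bfloat16)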
 
cooldown/iter_0060000_hf/generation_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 151849,
4
- "eos_token_id": 151850,
5
- "transformers_version": "4.44.2"
6
- }
 
cooldown/iter_0060000_hf/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea50f85473000535c9d7faa85799ce52f22f9f229f2d33b00e078640beae89ee
3
- size 562302352
 
cooldown/iter_0060000_hf/qwen.tiktoken DELETED
The diff for this file is too large to render. See raw diff
 
cooldown/iter_0060000_hf/qwen_generation_utils.py DELETED
@@ -1,416 +0,0 @@
1
- # Copyright (c) Alibaba Cloud.
2
- #
3
- # This source code is licensed under the license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- """Generation support."""
7
-
8
- from typing import Tuple, List, Union, Iterable
9
-
10
- import numpy as np
11
- import torch
12
- import torch.nn.functional as F
13
- from transformers import PreTrainedTokenizer
14
- from transformers import logging
15
- from transformers.generation import LogitsProcessor
16
-
17
- logger = logging.get_logger(__name__)
18
-
19
- # Types.
20
- HistoryType = List[Tuple[str, str]]
21
- TokensType = List[int]
22
- BatchTokensType = List[List[int]]
23
-
24
-
25
- def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType:
26
- for tokens in batch:
27
- context_length = len(tokens)
28
- if context_length < seq_length:
29
- tokens.extend([pad_id] * (seq_length - context_length))
30
- return batch
31
-
32
-
33
- def get_ltor_masks_and_position_ids(
34
- data,
35
- eod_token,
36
- reset_position_ids,
37
- reset_attention_mask,
38
- eod_mask_loss,
39
- ):
40
- """Build masks and position id for left to right model."""
41
-
42
- # Extract batch size and sequence length.
43
- micro_batch_size, seq_length = data.size()
44
-
45
- # Attention mask (lower triangular).
46
- if reset_attention_mask:
47
- att_mask_batch = micro_batch_size
48
- else:
49
- att_mask_batch = 1
50
- attention_mask = torch.tril(
51
- torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
52
- ).view(att_mask_batch, 1, seq_length, seq_length)
53
-
54
- # Loss mask.
55
- loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
56
- if eod_mask_loss:
57
- loss_mask[data == eod_token] = 0.0
58
-
59
- # Position ids.
60
- position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)
61
- position_ids = position_ids.unsqueeze(0).expand_as(data)
62
- # We need to clone as the ids will be modifed based on batch index.
63
- if reset_position_ids:
64
- position_ids = position_ids.clone()
65
-
66
- if reset_position_ids or reset_attention_mask:
67
- # Loop through the batches:
68
- for b in range(micro_batch_size):
69
-
70
- # Find indecies where EOD token is.
71
- eod_index = position_ids[b, data[b] == eod_token]
72
- # Detach indecies from positions if going to modify positions.
73
- if reset_position_ids:
74
- eod_index = eod_index.clone()
75
-
76
- # Loop through EOD indecies:
77
- prev_index = 0
78
- for j in range(eod_index.size()[0]):
79
- i = eod_index[j]
80
- # Mask attention loss.
81
- if reset_attention_mask:
82
- attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0
83
- # Reset positions.
84
- if reset_position_ids:
85
- position_ids[b, (i + 1) :] -= i + 1 - prev_index
86
- prev_index = i + 1
87
-
88
- # Convert attention mask to binary:
89
- attention_mask = attention_mask < 0.5
90
-
91
- return attention_mask, loss_mask, position_ids
92
-
93
-
94
- def get_batch(context_tokens: torch.LongTensor, eod_id: int):
95
- """Generate batch from context tokens."""
96
- # Move to GPU.
97
- tokens = context_tokens.contiguous().to(context_tokens.device)
98
- # Get the attention mask and postition ids.
99
- attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
100
- tokens,
101
- eod_id,
102
- reset_position_ids=False,
103
- reset_attention_mask=False,
104
- eod_mask_loss=False,
105
- )
106
- return tokens, attention_mask, position_ids
107
-
108
-
109
- def get_stop_words_ids(chat_format, tokenizer):
110
- if chat_format == "raw":
111
- stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]]
112
- elif chat_format == "chatml":
113
- stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
114
- else:
115
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
116
- return stop_words_ids
117
-
118
-
119
- def make_context(
120
- tokenizer: PreTrainedTokenizer,
121
- query: str,
122
- history: List[Tuple[str, str]] = None,
123
- system: str = "",
124
- max_window_size: int = 6144,
125
- chat_format: str = "chatml",
126
- ):
127
- if history is None:
128
- history = []
129
-
130
- if chat_format == "chatml":
131
- im_start, im_end = "<|im_start|>", "<|im_end|>"
132
- im_start_tokens = [tokenizer.im_start_id]
133
- im_end_tokens = [tokenizer.im_end_id]
134
- nl_tokens = tokenizer.encode("\n")
135
-
136
- def _tokenize_str(role, content):
137
- return f"{role}\n{content}", tokenizer.encode(
138
- role, allowed_special=set()
139
- ) + nl_tokens + tokenizer.encode(content, allowed_special=set())
140
-
141
- system_text, system_tokens_part = _tokenize_str("system", system)
142
- system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
143
-
144
- raw_text = ""
145
- context_tokens = []
146
-
147
- for turn_query, turn_response in reversed(history):
148
- query_text, query_tokens_part = _tokenize_str("user", turn_query)
149
- query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
150
- response_text, response_tokens_part = _tokenize_str(
151
- "assistant", turn_response
152
- )
153
- response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
154
-
155
- next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
156
- prev_chat = (
157
- f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
158
- )
159
-
160
- current_context_size = (
161
- len(system_tokens) + len(next_context_tokens) + len(context_tokens)
162
- )
163
- if current_context_size < max_window_size:
164
- context_tokens = next_context_tokens + context_tokens
165
- raw_text = prev_chat + raw_text
166
- else:
167
- break
168
-
169
- context_tokens = system_tokens + context_tokens
170
- raw_text = f"{im_start}{system_text}{im_end}" + raw_text
171
- context_tokens += (
172
- nl_tokens
173
- + im_start_tokens
174
- + _tokenize_str("user", query)[1]
175
- + im_end_tokens
176
- + nl_tokens
177
- + im_start_tokens
178
- + tokenizer.encode("assistant")
179
- + nl_tokens
180
- )
181
- raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
182
-
183
- elif chat_format == "raw":
184
- raw_text = query
185
- context_tokens = tokenizer.encode(raw_text)
186
- else:
187
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
188
-
189
- return raw_text, context_tokens
190
-
191
-
192
- def _decode_default(
193
- tokens: List[int],
194
- *,
195
- stop_words: List[str],
196
- eod_words: List[str],
197
- tokenizer: PreTrainedTokenizer,
198
- raw_text_len: int,
199
- verbose: bool = False,
200
- return_end_reason: bool = False,
201
- errors: str='replace',
202
- ):
203
- trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
204
- if verbose:
205
- print("\nRaw Generate: ", trim_decode_tokens)
206
-
207
- end_reason = f"Gen length {len(tokens)}"
208
- for stop_word in stop_words:
209
- trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
210
- for eod_word in eod_words:
211
- if eod_word in trim_decode_tokens:
212
- end_reason = f"Gen {eod_word!r}"
213
- trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
214
- trim_decode_tokens = trim_decode_tokens.strip()
215
- if verbose:
216
- print("\nEnd Reason:", end_reason)
217
- print("\nGenerate: ", trim_decode_tokens)
218
-
219
- if return_end_reason:
220
- return trim_decode_tokens, end_reason
221
- else:
222
- return trim_decode_tokens
223
-
224
-
225
- def _decode_chatml(
226
- tokens: List[int],
227
- *,
228
- stop_words: List[str],
229
- eod_token_ids: List[int],
230
- tokenizer: PreTrainedTokenizer,
231
- raw_text_len: int,
232
- context_length: int,
233
- verbose: bool = False,
234
- return_end_reason: bool = False,
235
- errors: str='replace'
236
- ):
237
- end_reason = f"Gen length {len(tokens)}"
238
- eod_token_idx = context_length
239
- for eod_token_idx in range(context_length, len(tokens)):
240
- if tokens[eod_token_idx] in eod_token_ids:
241
- end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
242
- break
243
-
244
- trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
245
- if verbose:
246
- print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
247
- print("\nRaw Generate:", trim_decode_tokens)
248
- print("\nEnd Reason:", end_reason)
249
- for stop_word in stop_words:
250
- trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
251
- trim_decode_tokens = trim_decode_tokens.strip()
252
- if verbose:
253
- print("\nGenerate:", trim_decode_tokens)
254
-
255
- if return_end_reason:
256
- return trim_decode_tokens, end_reason
257
- else:
258
- return trim_decode_tokens
259
-
260
-
261
- def decode_tokens(
262
- tokens: Union[torch.LongTensor, TokensType],
263
- tokenizer: PreTrainedTokenizer,
264
- raw_text_len: int,
265
- context_length: int,
266
- chat_format: str,
267
- verbose: bool = False,
268
- return_end_reason: bool = False,
269
- errors: str="replace",
270
- ) -> str:
271
- if torch.is_tensor(tokens):
272
- tokens = tokens.cpu().numpy().tolist()
273
-
274
- if chat_format == "chatml":
275
- return _decode_chatml(
276
- tokens,
277
- stop_words=[],
278
- eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
279
- tokenizer=tokenizer,
280
- raw_text_len=raw_text_len,
281
- context_length=context_length,
282
- verbose=verbose,
283
- return_end_reason=return_end_reason,
284
- errors=errors,
285
- )
286
- elif chat_format == "raw":
287
- return _decode_default(
288
- tokens,
289
- stop_words=["<|endoftext|>"],
290
- eod_words=["<|endoftext|>"],
291
- tokenizer=tokenizer,
292
- raw_text_len=raw_text_len,
293
- verbose=verbose,
294
- return_end_reason=return_end_reason,
295
- errors=errors,
296
- )
297
- else:
298
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
299
-
300
-
301
- class StopWordsLogitsProcessor(LogitsProcessor):
302
- """
303
- :class:`transformers.LogitsProcessor` that enforces that when specified sequences appear, stop geration.
304
-
305
- Args:
306
- stop_words_ids (:obj:`List[List[int]]`):
307
- List of list of token ids of stop ids. In order to get the tokens of the words
308
- that should not appear in the generated text, use :obj:`tokenizer(bad_word,
309
- add_prefix_space=True).input_ids`.
310
- eos_token_id (:obj:`int`):
311
- The id of the `end-of-sequence` token.
312
- """
313
-
314
- def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
315
-
316
- if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
317
- raise ValueError(
318
- f"`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}."
319
- )
320
- if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
321
- raise ValueError(
322
- f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
323
- )
324
- if any(
325
- any(
326
- (not isinstance(token_id, (int, np.integer)) or token_id < 0)
327
- for token_id in stop_word_ids
328
- )
329
- for stop_word_ids in stop_words_ids
330
- ):
331
- raise ValueError(
332
- f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
333
- )
334
-
335
- self.stop_words_ids = list(
336
- filter(
337
- lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
338
- )
339
- )
340
- self.eos_token_id = eos_token_id
341
- for stop_token_seq in self.stop_words_ids:
342
- assert (
343
- len(stop_token_seq) > 0
344
- ), "Stop words token sequences {} cannot have an empty list".format(
345
- stop_words_ids
346
- )
347
-
348
- def __call__(
349
- self, input_ids: torch.LongTensor, scores: torch.FloatTensor
350
- ) -> torch.FloatTensor:
351
- stopped_samples = self._calc_stopped_samples(input_ids)
352
- for i, should_stop in enumerate(stopped_samples):
353
- if should_stop:
354
- scores[i, self.eos_token_id] = float(2**15)
355
- return scores
356
-
357
- def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
358
- if len(tokens) == 0:
359
- # if bad word tokens is just one token always ban it
360
- return True
361
- elif len(tokens) > len(prev_tokens):
362
- # if bad word tokens are longer then prev input_ids they can't be equal
363
- return False
364
- elif prev_tokens[-len(tokens) :].tolist() == tokens:
365
- # if tokens match
366
- return True
367
- else:
368
- return False
369
-
370
- def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
371
- stopped_samples = []
372
- for prev_input_ids_slice in prev_input_ids:
373
- match = False
374
- for stop_token_seq in self.stop_words_ids:
375
- if self._tokens_match(prev_input_ids_slice, stop_token_seq):
376
- # if tokens do not match continue
377
- match = True
378
- break
379
- stopped_samples.append(match)
380
-
381
- return stopped_samples
382
-
383
-
384
- def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
385
- """This function has been mostly taken from huggingface conversational
386
- ai code at
387
- https://medium.com/huggingface/how-to-build-a-state-of-the-art-
388
- conversational-ai-with-transfer-learning-2d818ac26313"""
389
-
390
- if top_k > 0:
391
- # Remove all tokens with a probability less than the
392
- # last token of the top-k
393
- indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
394
- logits[indices_to_remove] = filter_value
395
-
396
- if top_p > 0.0:
397
- # Cconvert to 1D
398
- sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
399
- cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
400
-
401
- # Remove tokens with cumulative probability above the threshold
402
- sorted_indices_to_remove = cumulative_probs > top_p
403
- # Shift the indices to the right to keep also the first token
404
- # above the threshold
405
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
406
- sorted_indices_to_remove[..., 0] = 0
407
- for i in range(sorted_indices.size(0)):
408
- indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
409
- logits[i][indices_to_remove] = filter_value
410
-
411
- return logits
412
-
413
-
414
- def switch(val1, val2, boolean):
415
- boolean = boolean.type_as(val1)
416
- return (1 - boolean) * val1 + boolean * val2
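
The deleted qwen_generation_utils.py carries the Qwen-style inference helpers: ChatML prompt construction (make_context), stop-word handling (get_stop_words_ids, StopWordsLogitsProcessor), and response decoding (decode_tokens). The sketch below shows how these pieces are typically wired into generate; it assumes a model and the custom tokenizer from this folder are already loaded, so it is a usage outline under those assumptions rather than the repository's own inference script.

    # Usage outline, assuming `model` and the Qwen tokenizer are already loaded.
    import torch
    from transformers import LogitsProcessorList
    from qwen_generation_utils import (
        StopWordsLogitsProcessor,
        decode_tokens,
        get_stop_words_ids,
        make_context,
    )

    # Build a ChatML prompt and the matching stop-word processor.
    raw_text, context_tokens = make_context(tokenizer, "Hello!", chat_format="chatml")
    stop_ids = get_stop_words_ids("chatml", tokenizer)
    processors = LogitsProcessorList(
        [StopWordsLogitsProcessor(stop_words_ids=stop_ids, eos_token_id=tokenizer.eod_id)]
    )

    input_ids = torch.tensor([context_tokens], device=model.device)
    output_ids = model.generate(input_ids, logits_processor=processors, max_new_tokens=128)

    # Strip the prompt and any trailing special tokens from the generation.
    response = decode_tokens(
        output_ids[0],
        tokenizer,
        raw_text_len=len(raw_text),
        context_length=len(context_tokens),
        chat_format="chatml",
    )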
 
cooldown/iter_0060000_hf/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "bos_token": "<|extra_203|>",
3
- "eos_token": "<|extra_204|>",
4
- "unk_token": "<|endoftext|>",
5
- "pad_token": "<|endoftext|>"
6
- }
 
cooldown/iter_0060000_hf/tokenization_qwen.py DELETED
@@ -1,276 +0,0 @@
1
- # Copyright (c) Alibaba Cloud.
2
- #
3
- # This source code is licensed under the license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- """Tokenization classes for QWen."""
7
-
8
- import base64
9
- import logging
10
- import os
11
- import unicodedata
12
- from typing import Collection, Dict, List, Set, Tuple, Union
13
-
14
- import tiktoken
15
- from transformers import PreTrainedTokenizer, AddedToken
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
21
-
22
- PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
23
- ENDOFTEXT = "<|endoftext|>"
24
- IMSTART = "<|im_start|>"
25
- IMEND = "<|im_end|>"
26
- # as the default behavior is changed to allow special tokens in
27
- # regular texts, the surface forms of special tokens need to be
28
- # as different as possible to minimize the impact
29
- EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
30
- # changed to use actual index to avoid misconfiguration with vocabulary expansion
31
- SPECIAL_START_ID = 151643
32
- SPECIAL_TOKENS = tuple(
33
- enumerate(
34
- (
35
- (
36
- ENDOFTEXT,
37
- IMSTART,
38
- IMEND,
39
- )
40
- + EXTRAS
41
- ),
42
- start=SPECIAL_START_ID,
43
- )
44
- )
45
- SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
46
-
47
-
48
- def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
49
- with open(tiktoken_bpe_file, "rb") as f:
50
- contents = f.read()
51
- return {
52
- base64.b64decode(token): int(rank)
53
- for token, rank in (line.split() for line in contents.splitlines() if line)
54
- }
55
-
56
-
57
- class QWenTokenizer(PreTrainedTokenizer):
58
- """QWen tokenizer."""
59
-
60
- vocab_files_names = VOCAB_FILES_NAMES
61
-
62
- def __init__(
63
- self,
64
- vocab_file,
65
- errors="replace",
66
- extra_vocab_file=None,
67
- **kwargs,
68
- ):
69
- super().__init__(**kwargs)
70
-
71
- # how to handle errors in decoding UTF-8 byte sequences
72
- # use ignore if you are in streaming inference
73
- self.errors = errors
74
-
75
- self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
76
- self.special_tokens = {
77
- token: index
78
- for index, token in SPECIAL_TOKENS
79
- }
80
-
81
- # try load extra vocab from file
82
- if extra_vocab_file is not None:
83
- used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
84
- extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
85
- for token, index in extra_mergeable_ranks.items():
86
- if token in self.mergeable_ranks:
87
- logger.info(f"extra token {token} exists, skipping")
88
- continue
89
- if index in used_ids:
90
- logger.info(f'the index {index} for extra token {token} exists, skipping')
91
- continue
92
- self.mergeable_ranks[token] = index
93
- # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
94
-
95
- enc = tiktoken.Encoding(
96
- "Qwen",
97
- pat_str=PAT_STR,
98
- mergeable_ranks=self.mergeable_ranks,
99
- special_tokens=self.special_tokens,
100
- )
101
- assert (
102
- len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
103
- ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
104
-
105
- self.decoder = {
106
- v: k for k, v in self.mergeable_ranks.items()
107
- } # type: dict[int, bytes|str]
108
- self.decoder.update({v: k for k, v in self.special_tokens.items()})
109
-
110
- self.tokenizer = enc # type: tiktoken.Encoding
111
-
112
- self.eod_id = self.tokenizer.eot_token
113
- self.im_start_id = self.special_tokens[IMSTART]
114
- self.im_end_id = self.special_tokens[IMEND]
115
-
116
- def __getstate__(self):
117
- # for pickle lovers
118
- state = self.__dict__.copy()
119
- del state["tokenizer"]
120
- return state
121
-
122
- def __setstate__(self, state):
123
- # tokenizer is not python native; don't pass it; rebuild it
124
- self.__dict__.update(state)
125
- enc = tiktoken.Encoding(
126
- "Qwen",
127
- pat_str=PAT_STR,
128
- mergeable_ranks=self.mergeable_ranks,
129
- special_tokens=self.special_tokens,
130
- )
131
- self.tokenizer = enc
132
-
133
- def __len__(self) -> int:
134
- return self.tokenizer.n_vocab
135
-
136
- def get_vocab(self) -> Dict[bytes, int]:
137
- return self.mergeable_ranks
138
-
139
- def convert_tokens_to_ids(
140
- self, tokens: Union[bytes, str, List[Union[bytes, str]]]
141
- ) -> List[int]:
142
- ids = []
143
- if isinstance(tokens, (str, bytes)):
144
- if tokens in self.special_tokens:
145
- return self.special_tokens[tokens]
146
- else:
147
- return self.mergeable_ranks.get(tokens)
148
- for token in tokens:
149
- if token in self.special_tokens:
150
- ids.append(self.special_tokens[token])
151
- else:
152
- ids.append(self.mergeable_ranks.get(token))
153
- return ids
154
-
155
- def _add_tokens(
156
- self,
157
- new_tokens: Union[List[str], List[AddedToken]],
158
- special_tokens: bool = False,
159
- ) -> int:
160
- if not special_tokens and new_tokens:
161
- raise ValueError("Adding regular tokens is not supported")
162
- for token in new_tokens:
163
- surface_form = token.content if isinstance(token, AddedToken) else token
164
- if surface_form not in SPECIAL_TOKENS_SET:
165
- raise ValueError("Adding unknown special tokens is not supported")
166
- return 0
167
-
168
- def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
169
- """
170
- Save only the vocabulary of the tokenizer (vocabulary).
171
-
172
- Returns:
173
- `Tuple(str)`: Paths to the files saved.
174
- """
175
- file_path = os.path.join(save_directory, "qwen.tiktoken")
176
- with open(file_path, "w", encoding="utf8") as w:
177
- for k, v in self.mergeable_ranks.items():
178
- line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
179
- w.write(line)
180
- return (file_path,)
181
-
182
- def tokenize(
183
- self,
184
- text: str,
185
- allowed_special: Union[Set, str] = "all",
186
- disallowed_special: Union[Collection, str] = (),
187
- **kwargs,
188
- ) -> List[Union[bytes, str]]:
189
- """
190
- Converts a string in a sequence of tokens.
191
-
192
- Args:
193
- text (`str`):
194
- The sequence to be encoded.
195
- allowed_special (`Literal["all"]` or `set`):
196
- The surface forms of the tokens to be encoded as special tokens in regular texts.
197
- Default to "all".
198
- disallowed_special (`Literal["all"]` or `Collection`):
199
- The surface forms of the tokens that should not be in regular texts and trigger errors.
200
- Default to an empty tuple.
201
-
202
- kwargs (additional keyword arguments, *optional*):
203
- Will be passed to the underlying model specific encode method.
204
-
205
- Returns:
206
- `List[bytes|str]`: The list of tokens.
207
- """
208
- tokens = []
209
- text = unicodedata.normalize("NFC", text)
210
-
211
- # this implementation takes a detour: text -> token id -> token surface forms
212
- for t in self.tokenizer.encode(
213
- text, allowed_special=allowed_special, disallowed_special=disallowed_special
214
- ):
215
- tokens.append(self.decoder[t])
216
- return tokens
217
-
218
- def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
219
- """
220
- Converts a sequence of tokens in a single string.
221
- """
222
- text = ""
223
- temp = b""
224
- for t in tokens:
225
- if isinstance(t, str):
226
- if temp:
227
- text += temp.decode("utf-8", errors=self.errors)
228
- temp = b""
229
- text += t
230
- elif isinstance(t, bytes):
231
- temp += t
232
- else:
233
- raise TypeError("token should only be of type types or str")
234
- if temp:
235
- text += temp.decode("utf-8", errors=self.errors)
236
- return text
237
-
238
- @property
239
- def vocab_size(self):
240
- return self.tokenizer.n_vocab
241
-
242
- def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
243
- """Converts an id to a token, special tokens included"""
244
- if index in self.decoder:
245
- return self.decoder[index]
246
- raise ValueError("unknown ids")
247
-
248
- def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
249
- """Converts a token to an id using the vocab, special tokens included"""
250
- if token in self.special_tokens:
251
- return self.special_tokens[token]
252
- if token in self.mergeable_ranks:
253
- return self.mergeable_ranks[token]
254
- raise ValueError("unknown token")
255
-
256
- def _tokenize(self, text: str, **kwargs):
257
- """
258
- Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
259
- vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
260
-
261
- Do NOT take care of added tokens.
262
- """
263
- raise NotImplementedError
264
-
265
- def _decode(
266
- self,
267
- token_ids: Union[int, List[int]],
268
- skip_special_tokens: bool = False,
269
- errors: str = None,
270
- **kwargs,
271
- ) -> str:
272
- if isinstance(token_ids, int):
273
- token_ids = [token_ids]
274
- if skip_special_tokens:
275
- token_ids = [i for i in token_ids if i < self.eod_id]
276
- return self.tokenizer.decode(token_ids, errors=errors or self.errors)
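
tokenization_qwen.py implements the tiktoken-backed QWenTokenizer that this checkpoint relies on. It is registered through the auto_map entry in tokenizer_config.json (further down in this diff), so it is normally loaded via AutoTokenizer with trust_remote_code enabled. A minimal sketch, assuming the deleted directory is still present locally (path hypothetical):

    # Sketch: loading the custom tiktoken-based tokenizer shipped with the checkpoint.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "cooldown/iter_0060000_hf",  # hypothetical local path
        trust_remote_code=True,      # needed: QWenTokenizer lives in repo code, not in transformers
    )

    ids = tokenizer.encode("hello world")
    print(ids, tokenizer.decode(ids))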
 
cooldown/iter_0060000_hf/tokenizer_config.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "model_max_length": 8192,
3
- "tokenizer_class": "QWenTokenizer",
4
- "auto_map": {
5
- "AutoTokenizer": [
6
- "tokenization_qwen.QWenTokenizer",
7
- null
8
- ]
9
- },
10
- "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
11
- }
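
Besides pointing auto_map at the custom QWenTokenizer, this config carries a ChatML chat_template, the same format that make_context assembles by hand. A short illustration of rendering a prompt through it (the messages are made up; assumes the tokenizer was loaded as in the earlier sketch):

    # Rendering a prompt with the ChatML chat_template from this config.
    messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello!"},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)
    # <|im_start|>system
    # You are a helpful assistant<|im_end|>
    # <|im_start|>user
    # Hello!<|im_end|>
    # <|im_start|>assistant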
 
cooldown/iter_0070000_hf/config.json DELETED
@@ -1,27 +0,0 @@
1
- {
2
- "architectures": [
3
- "MistralForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151849,
8
- "eos_token_id": 151850,
9
- "head_dim": 64,
10
- "hidden_act": "silu",
11
- "hidden_size": 576,
12
- "initializer_range": 0.02,
13
- "intermediate_size": 1536,
14
- "max_position_embeddings": 8192,
15
- "model_type": "mistral",
16
- "num_attention_heads": 9,
17
- "num_hidden_layers": 30,
18
- "num_key_value_heads": 3,
19
- "rms_norm_eps": 1e-05,
20
- "rope_theta": 10000,
21
- "sliding_window": 8192,
22
- "tie_word_embeddings": true,
23
- "torch_dtype": "bfloat16",
24
- "transformers_version": "4.44.2",
25
- "use_cache": true,
26
- "vocab_size": 151851
27
- }
 
cooldown/iter_0070000_hf/generation_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 151849,
4
- "eos_token_id": 151850,
5
- "transformers_version": "4.44.2"
6
- }
 
cooldown/iter_0070000_hf/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bcb3c3a80c6c3f4fdb486cc353a684f120d3eba67ac4571fb5a511f3f11b1f4
3
- size 562302352
 
cooldown/iter_0070000_hf/qwen.tiktoken DELETED
The diff for this file is too large to render. See raw diff
 
cooldown/iter_0070000_hf/qwen_generation_utils.py DELETED
@@ -1,416 +0,0 @@
1
- # Copyright (c) Alibaba Cloud.
2
- #
3
- # This source code is licensed under the license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- """Generation support."""
7
-
8
- from typing import Tuple, List, Union, Iterable
9
-
10
- import numpy as np
11
- import torch
12
- import torch.nn.functional as F
13
- from transformers import PreTrainedTokenizer
14
- from transformers import logging
15
- from transformers.generation import LogitsProcessor
16
-
17
- logger = logging.get_logger(__name__)
18
-
19
- # Types.
20
- HistoryType = List[Tuple[str, str]]
21
- TokensType = List[int]
22
- BatchTokensType = List[List[int]]
23
-
24
-
25
- def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType:
26
- for tokens in batch:
27
- context_length = len(tokens)
28
- if context_length < seq_length:
29
- tokens.extend([pad_id] * (seq_length - context_length))
30
- return batch
31
-
32
-
33
- def get_ltor_masks_and_position_ids(
34
- data,
35
- eod_token,
36
- reset_position_ids,
37
- reset_attention_mask,
38
- eod_mask_loss,
39
- ):
40
- """Build masks and position id for left to right model."""
41
-
42
- # Extract batch size and sequence length.
43
- micro_batch_size, seq_length = data.size()
44
-
45
- # Attention mask (lower triangular).
46
- if reset_attention_mask:
47
- att_mask_batch = micro_batch_size
48
- else:
49
- att_mask_batch = 1
50
- attention_mask = torch.tril(
51
- torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
52
- ).view(att_mask_batch, 1, seq_length, seq_length)
53
-
54
- # Loss mask.
55
- loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
56
- if eod_mask_loss:
57
- loss_mask[data == eod_token] = 0.0
58
-
59
- # Position ids.
60
- position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)
61
- position_ids = position_ids.unsqueeze(0).expand_as(data)
62
- # We need to clone as the ids will be modifed based on batch index.
63
- if reset_position_ids:
64
- position_ids = position_ids.clone()
65
-
66
- if reset_position_ids or reset_attention_mask:
67
- # Loop through the batches:
68
- for b in range(micro_batch_size):
69
-
70
- # Find indecies where EOD token is.
71
- eod_index = position_ids[b, data[b] == eod_token]
72
- # Detach indecies from positions if going to modify positions.
73
- if reset_position_ids:
74
- eod_index = eod_index.clone()
75
-
76
- # Loop through EOD indecies:
77
- prev_index = 0
78
- for j in range(eod_index.size()[0]):
79
- i = eod_index[j]
80
- # Mask attention loss.
81
- if reset_attention_mask:
82
- attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0
83
- # Reset positions.
84
- if reset_position_ids:
85
- position_ids[b, (i + 1) :] -= i + 1 - prev_index
86
- prev_index = i + 1
87
-
88
- # Convert attention mask to binary:
89
- attention_mask = attention_mask < 0.5
90
-
91
- return attention_mask, loss_mask, position_ids
92
-
93
-
94
- def get_batch(context_tokens: torch.LongTensor, eod_id: int):
95
- """Generate batch from context tokens."""
96
- # Move to GPU.
97
- tokens = context_tokens.contiguous().to(context_tokens.device)
98
- # Get the attention mask and postition ids.
99
- attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
100
- tokens,
101
- eod_id,
102
- reset_position_ids=False,
103
- reset_attention_mask=False,
104
- eod_mask_loss=False,
105
- )
106
- return tokens, attention_mask, position_ids
107
-
108
-
109
- def get_stop_words_ids(chat_format, tokenizer):
110
- if chat_format == "raw":
111
- stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]]
112
- elif chat_format == "chatml":
113
- stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
114
- else:
115
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
116
- return stop_words_ids
117
-
118
-
119
- def make_context(
120
- tokenizer: PreTrainedTokenizer,
121
- query: str,
122
- history: List[Tuple[str, str]] = None,
123
- system: str = "",
124
- max_window_size: int = 6144,
125
- chat_format: str = "chatml",
126
- ):
127
- if history is None:
128
- history = []
129
-
130
- if chat_format == "chatml":
131
- im_start, im_end = "<|im_start|>", "<|im_end|>"
132
- im_start_tokens = [tokenizer.im_start_id]
133
- im_end_tokens = [tokenizer.im_end_id]
134
- nl_tokens = tokenizer.encode("\n")
135
-
136
- def _tokenize_str(role, content):
137
- return f"{role}\n{content}", tokenizer.encode(
138
- role, allowed_special=set()
139
- ) + nl_tokens + tokenizer.encode(content, allowed_special=set())
140
-
141
- system_text, system_tokens_part = _tokenize_str("system", system)
142
- system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
143
-
144
- raw_text = ""
145
- context_tokens = []
146
-
147
- for turn_query, turn_response in reversed(history):
148
- query_text, query_tokens_part = _tokenize_str("user", turn_query)
149
- query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
150
- response_text, response_tokens_part = _tokenize_str(
151
- "assistant", turn_response
152
- )
153
- response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
154
-
155
- next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
156
- prev_chat = (
157
- f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
158
- )
159
-
160
- current_context_size = (
161
- len(system_tokens) + len(next_context_tokens) + len(context_tokens)
162
- )
163
- if current_context_size < max_window_size:
164
- context_tokens = next_context_tokens + context_tokens
165
- raw_text = prev_chat + raw_text
166
- else:
167
- break
168
-
169
- context_tokens = system_tokens + context_tokens
170
- raw_text = f"{im_start}{system_text}{im_end}" + raw_text
171
- context_tokens += (
172
- nl_tokens
173
- + im_start_tokens
174
- + _tokenize_str("user", query)[1]
175
- + im_end_tokens
176
- + nl_tokens
177
- + im_start_tokens
178
- + tokenizer.encode("assistant")
179
- + nl_tokens
180
- )
181
- raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
182
-
183
- elif chat_format == "raw":
184
- raw_text = query
185
- context_tokens = tokenizer.encode(raw_text)
186
- else:
187
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
188
-
189
- return raw_text, context_tokens
190
-
191
-
192
- def _decode_default(
193
- tokens: List[int],
194
- *,
195
- stop_words: List[str],
196
- eod_words: List[str],
197
- tokenizer: PreTrainedTokenizer,
198
- raw_text_len: int,
199
- verbose: bool = False,
200
- return_end_reason: bool = False,
201
- errors: str='replace',
202
- ):
203
- trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
204
- if verbose:
205
- print("\nRaw Generate: ", trim_decode_tokens)
206
-
207
- end_reason = f"Gen length {len(tokens)}"
208
- for stop_word in stop_words:
209
- trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
210
- for eod_word in eod_words:
211
- if eod_word in trim_decode_tokens:
212
- end_reason = f"Gen {eod_word!r}"
213
- trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
214
- trim_decode_tokens = trim_decode_tokens.strip()
215
- if verbose:
216
- print("\nEnd Reason:", end_reason)
217
- print("\nGenerate: ", trim_decode_tokens)
218
-
219
- if return_end_reason:
220
- return trim_decode_tokens, end_reason
221
- else:
222
- return trim_decode_tokens
223
-
224
-
225
- def _decode_chatml(
226
- tokens: List[int],
227
- *,
228
- stop_words: List[str],
229
- eod_token_ids: List[int],
230
- tokenizer: PreTrainedTokenizer,
231
- raw_text_len: int,
232
- context_length: int,
233
- verbose: bool = False,
234
- return_end_reason: bool = False,
235
- errors: str='replace'
236
- ):
237
- end_reason = f"Gen length {len(tokens)}"
238
- eod_token_idx = context_length
239
- for eod_token_idx in range(context_length, len(tokens)):
240
- if tokens[eod_token_idx] in eod_token_ids:
241
- end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
242
- break
243
-
244
- trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
245
- if verbose:
246
- print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
247
- print("\nRaw Generate:", trim_decode_tokens)
248
- print("\nEnd Reason:", end_reason)
249
- for stop_word in stop_words:
250
- trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
251
- trim_decode_tokens = trim_decode_tokens.strip()
252
- if verbose:
253
- print("\nGenerate:", trim_decode_tokens)
254
-
255
- if return_end_reason:
256
- return trim_decode_tokens, end_reason
257
- else:
258
- return trim_decode_tokens
259
-
260
-
261
- def decode_tokens(
262
- tokens: Union[torch.LongTensor, TokensType],
263
- tokenizer: PreTrainedTokenizer,
264
- raw_text_len: int,
265
- context_length: int,
266
- chat_format: str,
267
- verbose: bool = False,
268
- return_end_reason: bool = False,
269
- errors: str="replace",
270
- ) -> str:
271
- if torch.is_tensor(tokens):
272
- tokens = tokens.cpu().numpy().tolist()
273
-
274
- if chat_format == "chatml":
275
- return _decode_chatml(
276
- tokens,
277
- stop_words=[],
278
- eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
279
- tokenizer=tokenizer,
280
- raw_text_len=raw_text_len,
281
- context_length=context_length,
282
- verbose=verbose,
283
- return_end_reason=return_end_reason,
284
- errors=errors,
285
- )
286
- elif chat_format == "raw":
287
- return _decode_default(
288
- tokens,
289
- stop_words=["<|endoftext|>"],
290
- eod_words=["<|endoftext|>"],
291
- tokenizer=tokenizer,
292
- raw_text_len=raw_text_len,
293
- verbose=verbose,
294
- return_end_reason=return_end_reason,
295
- errors=errors,
296
- )
297
- else:
298
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
299
-
300
-
301
- class StopWordsLogitsProcessor(LogitsProcessor):
302
- """
303
- :class:`transformers.LogitsProcessor` that enforces that when specified sequences appear, stop geration.
304
-
305
- Args:
306
- stop_words_ids (:obj:`List[List[int]]`):
307
- List of list of token ids of stop ids. In order to get the tokens of the words
308
- that should not appear in the generated text, use :obj:`tokenizer(bad_word,
309
- add_prefix_space=True).input_ids`.
310
- eos_token_id (:obj:`int`):
311
- The id of the `end-of-sequence` token.
312
- """
313
-
314
- def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
315
-
316
- if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
317
- raise ValueError(
318
- f"`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}."
319
- )
320
- if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
321
- raise ValueError(
322
- f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
323
- )
324
- if any(
325
- any(
326
- (not isinstance(token_id, (int, np.integer)) or token_id < 0)
327
- for token_id in stop_word_ids
328
- )
329
- for stop_word_ids in stop_words_ids
330
- ):
331
- raise ValueError(
332
- f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
333
- )
334
-
335
- self.stop_words_ids = list(
336
- filter(
337
- lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
338
- )
339
- )
340
- self.eos_token_id = eos_token_id
341
- for stop_token_seq in self.stop_words_ids:
342
- assert (
343
- len(stop_token_seq) > 0
344
- ), "Stop words token sequences {} cannot have an empty list".format(
345
- stop_words_ids
346
- )
347
-
348
- def __call__(
349
- self, input_ids: torch.LongTensor, scores: torch.FloatTensor
350
- ) -> torch.FloatTensor:
351
- stopped_samples = self._calc_stopped_samples(input_ids)
352
- for i, should_stop in enumerate(stopped_samples):
353
- if should_stop:
354
- scores[i, self.eos_token_id] = float(2**15)
355
- return scores
356
-
357
- def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
358
- if len(tokens) == 0:
359
- # if bad word tokens is just one token always ban it
360
- return True
361
- elif len(tokens) > len(prev_tokens):
362
- # if bad word tokens are longer then prev input_ids they can't be equal
363
- return False
364
- elif prev_tokens[-len(tokens) :].tolist() == tokens:
365
- # if tokens match
366
- return True
367
- else:
368
- return False
369
-
370
- def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
371
- stopped_samples = []
372
- for prev_input_ids_slice in prev_input_ids:
373
- match = False
374
- for stop_token_seq in self.stop_words_ids:
375
- if self._tokens_match(prev_input_ids_slice, stop_token_seq):
376
- # if tokens do not match continue
377
- match = True
378
- break
379
- stopped_samples.append(match)
380
-
381
- return stopped_samples
382
-
383
-
384
- def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
385
- """This function has been mostly taken from huggingface conversational
386
- ai code at
387
- https://medium.com/huggingface/how-to-build-a-state-of-the-art-
388
- conversational-ai-with-transfer-learning-2d818ac26313"""
389
-
390
- if top_k > 0:
391
- # Remove all tokens with a probability less than the
392
- # last token of the top-k
393
- indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
394
- logits[indices_to_remove] = filter_value
395
-
396
- if top_p > 0.0:
397
- # Cconvert to 1D
398
- sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
399
- cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
400
-
401
- # Remove tokens with cumulative probability above the threshold
402
- sorted_indices_to_remove = cumulative_probs > top_p
403
- # Shift the indices to the right to keep also the first token
404
- # above the threshold
405
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
406
- sorted_indices_to_remove[..., 0] = 0
407
- for i in range(sorted_indices.size(0)):
408
- indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
409
- logits[i][indices_to_remove] = filter_value
410
-
411
- return logits
412
-
413
-
414
- def switch(val1, val2, boolean):
415
- boolean = boolean.type_as(val1)
416
- return (1 - boolean) * val1 + boolean * val2
 
cooldown/iter_0070000_hf/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "bos_token": "<|extra_203|>",
3
- "eos_token": "<|extra_204|>",
4
- "unk_token": "<|endoftext|>",
5
- "pad_token": "<|endoftext|>"
6
- }
 
cooldown/iter_0070000_hf/tokenization_qwen.py DELETED
@@ -1,276 +0,0 @@
1
- # Copyright (c) Alibaba Cloud.
2
- #
3
- # This source code is licensed under the license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- """Tokenization classes for QWen."""
7
-
8
- import base64
9
- import logging
10
- import os
11
- import unicodedata
12
- from typing import Collection, Dict, List, Set, Tuple, Union
13
-
14
- import tiktoken
15
- from transformers import PreTrainedTokenizer, AddedToken
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
21
-
22
- PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
23
- ENDOFTEXT = "<|endoftext|>"
24
- IMSTART = "<|im_start|>"
25
- IMEND = "<|im_end|>"
26
- # as the default behavior is changed to allow special tokens in
27
- # regular texts, the surface forms of special tokens need to be
28
- # as different as possible to minimize the impact
29
- EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
30
- # changed to use actual index to avoid misconfiguration with vocabulary expansion
31
- SPECIAL_START_ID = 151643
32
- SPECIAL_TOKENS = tuple(
33
- enumerate(
34
- (
35
- (
36
- ENDOFTEXT,
37
- IMSTART,
38
- IMEND,
39
- )
40
- + EXTRAS
41
- ),
42
- start=SPECIAL_START_ID,
43
- )
44
- )
45
- SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
46
-
47
-
48
- def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
49
- with open(tiktoken_bpe_file, "rb") as f:
50
- contents = f.read()
51
- return {
52
- base64.b64decode(token): int(rank)
53
- for token, rank in (line.split() for line in contents.splitlines() if line)
54
- }
55
-
56
-
57
- class QWenTokenizer(PreTrainedTokenizer):
58
- """QWen tokenizer."""
59
-
60
- vocab_files_names = VOCAB_FILES_NAMES
61
-
62
- def __init__(
63
- self,
64
- vocab_file,
65
- errors="replace",
66
- extra_vocab_file=None,
67
- **kwargs,
68
- ):
69
- super().__init__(**kwargs)
70
-
71
- # how to handle errors in decoding UTF-8 byte sequences
72
- # use ignore if you are in streaming inference
73
- self.errors = errors
74
-
75
- self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
76
- self.special_tokens = {
77
- token: index
78
- for index, token in SPECIAL_TOKENS
79
- }
80
-
81
- # try load extra vocab from file
82
- if extra_vocab_file is not None:
83
- used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
84
- extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
85
- for token, index in extra_mergeable_ranks.items():
86
- if token in self.mergeable_ranks:
87
- logger.info(f"extra token {token} exists, skipping")
88
- continue
89
- if index in used_ids:
90
- logger.info(f'the index {index} for extra token {token} exists, skipping')
91
- continue
92
- self.mergeable_ranks[token] = index
93
- # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
94
-
95
- enc = tiktoken.Encoding(
96
- "Qwen",
97
- pat_str=PAT_STR,
98
- mergeable_ranks=self.mergeable_ranks,
99
- special_tokens=self.special_tokens,
100
- )
101
- assert (
102
- len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
103
- ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
104
-
105
- self.decoder = {
106
- v: k for k, v in self.mergeable_ranks.items()
107
- } # type: dict[int, bytes|str]
108
- self.decoder.update({v: k for k, v in self.special_tokens.items()})
109
-
110
- self.tokenizer = enc # type: tiktoken.Encoding
111
-
112
- self.eod_id = self.tokenizer.eot_token
113
- self.im_start_id = self.special_tokens[IMSTART]
114
- self.im_end_id = self.special_tokens[IMEND]
115
-
116
- def __getstate__(self):
117
- # for pickle lovers
118
- state = self.__dict__.copy()
119
- del state["tokenizer"]
120
- return state
121
-
122
- def __setstate__(self, state):
123
- # tokenizer is not python native; don't pass it; rebuild it
124
- self.__dict__.update(state)
125
- enc = tiktoken.Encoding(
126
- "Qwen",
127
- pat_str=PAT_STR,
128
- mergeable_ranks=self.mergeable_ranks,
129
- special_tokens=self.special_tokens,
130
- )
131
- self.tokenizer = enc
132
-
133
- def __len__(self) -> int:
134
- return self.tokenizer.n_vocab
135
-
136
- def get_vocab(self) -> Dict[bytes, int]:
137
- return self.mergeable_ranks
138
-
139
- def convert_tokens_to_ids(
140
- self, tokens: Union[bytes, str, List[Union[bytes, str]]]
141
- ) -> List[int]:
142
- ids = []
143
- if isinstance(tokens, (str, bytes)):
144
- if tokens in self.special_tokens:
145
- return self.special_tokens[tokens]
146
- else:
147
- return self.mergeable_ranks.get(tokens)
148
- for token in tokens:
149
- if token in self.special_tokens:
150
- ids.append(self.special_tokens[token])
151
- else:
152
- ids.append(self.mergeable_ranks.get(token))
153
- return ids
154
-
155
- def _add_tokens(
156
- self,
157
- new_tokens: Union[List[str], List[AddedToken]],
158
- special_tokens: bool = False,
159
- ) -> int:
160
- if not special_tokens and new_tokens:
161
- raise ValueError("Adding regular tokens is not supported")
162
- for token in new_tokens:
163
- surface_form = token.content if isinstance(token, AddedToken) else token
164
- if surface_form not in SPECIAL_TOKENS_SET:
165
- raise ValueError("Adding unknown special tokens is not supported")
166
- return 0
167
-
168
- def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
169
- """
170
- Save only the vocabulary of the tokenizer (vocabulary).
171
-
172
- Returns:
173
- `Tuple(str)`: Paths to the files saved.
174
- """
175
- file_path = os.path.join(save_directory, "qwen.tiktoken")
176
- with open(file_path, "w", encoding="utf8") as w:
177
- for k, v in self.mergeable_ranks.items():
178
- line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
179
- w.write(line)
180
- return (file_path,)
181
-
182
- def tokenize(
183
- self,
184
- text: str,
185
- allowed_special: Union[Set, str] = "all",
186
- disallowed_special: Union[Collection, str] = (),
187
- **kwargs,
188
- ) -> List[Union[bytes, str]]:
189
- """
190
- Converts a string in a sequence of tokens.
191
-
192
- Args:
193
- text (`str`):
194
- The sequence to be encoded.
195
- allowed_special (`Literal["all"]` or `set`):
196
- The surface forms of the tokens to be encoded as special tokens in regular texts.
197
- Default to "all".
198
- disallowed_special (`Literal["all"]` or `Collection`):
199
- The surface forms of the tokens that should not be in regular texts and trigger errors.
200
- Default to an empty tuple.
201
-
202
- kwargs (additional keyword arguments, *optional*):
203
- Will be passed to the underlying model specific encode method.
204
-
205
- Returns:
206
- `List[bytes|str]`: The list of tokens.
207
- """
208
- tokens = []
209
- text = unicodedata.normalize("NFC", text)
210
-
211
- # this implementation takes a detour: text -> token id -> token surface forms
212
- for t in self.tokenizer.encode(
213
- text, allowed_special=allowed_special, disallowed_special=disallowed_special
214
- ):
215
- tokens.append(self.decoder[t])
216
- return tokens
217
-
218
- def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
219
- """
220
- Converts a sequence of tokens in a single string.
221
- """
222
- text = ""
223
- temp = b""
224
- for t in tokens:
225
- if isinstance(t, str):
226
- if temp:
227
- text += temp.decode("utf-8", errors=self.errors)
228
- temp = b""
229
- text += t
230
- elif isinstance(t, bytes):
231
- temp += t
232
- else:
233
- raise TypeError("token should only be of type types or str")
234
- if temp:
235
- text += temp.decode("utf-8", errors=self.errors)
236
- return text
237
-
238
- @property
239
- def vocab_size(self):
240
- return self.tokenizer.n_vocab
241
-
242
- def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
243
- """Converts an id to a token, special tokens included"""
244
- if index in self.decoder:
245
- return self.decoder[index]
246
- raise ValueError("unknown ids")
247
-
248
- def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
249
- """Converts a token to an id using the vocab, special tokens included"""
250
- if token in self.special_tokens:
251
- return self.special_tokens[token]
252
- if token in self.mergeable_ranks:
253
- return self.mergeable_ranks[token]
254
- raise ValueError("unknown token")
255
-
256
- def _tokenize(self, text: str, **kwargs):
257
- """
258
- Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
259
- vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
260
-
261
- Do NOT take care of added tokens.
262
- """
263
- raise NotImplementedError
264
-
265
- def _decode(
266
- self,
267
- token_ids: Union[int, List[int]],
268
- skip_special_tokens: bool = False,
269
- errors: str = None,
270
- **kwargs,
271
- ) -> str:
272
- if isinstance(token_ids, int):
273
- token_ids = [token_ids]
274
- if skip_special_tokens:
275
- token_ids = [i for i in token_ids if i < self.eod_id]
276
- return self.tokenizer.decode(token_ids, errors=errors or self.errors)
 
cooldown/iter_0070000_hf/tokenizer_config.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "model_max_length": 8192,
3
- "tokenizer_class": "QWenTokenizer",
4
- "auto_map": {
5
- "AutoTokenizer": [
6
- "tokenization_qwen.QWenTokenizer",
7
- null
8
- ]
9
- },
10
- "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
11
- }
 
cooldown/iter_0084772_hf/config.json DELETED
@@ -1,27 +0,0 @@
1
- {
2
- "architectures": [
3
- "MistralForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151849,
8
- "eos_token_id": 151850,
9
- "head_dim": 64,
10
- "hidden_act": "silu",
11
- "hidden_size": 576,
12
- "initializer_range": 0.02,
13
- "intermediate_size": 1536,
14
- "max_position_embeddings": 8192,
15
- "model_type": "mistral",
16
- "num_attention_heads": 9,
17
- "num_hidden_layers": 30,
18
- "num_key_value_heads": 3,
19
- "rms_norm_eps": 1e-05,
20
- "rope_theta": 10000,
21
- "sliding_window": 8192,
22
- "tie_word_embeddings": true,
23
- "torch_dtype": "bfloat16",
24
- "transformers_version": "4.44.2",
25
- "use_cache": true,
26
- "vocab_size": 151851
27
- }
 
cooldown/iter_0084772_hf/generation_config.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 151849,
4
- "eos_token_id": 151850,
5
- "transformers_version": "4.44.2"
6
- }
 
cooldown/iter_0084772_hf/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe81ee1a16d461cf32d9a2ee119c901a21d139fd1f120f506e11d5fb196ba7df
3
- size 562302352
 
cooldown/iter_0084772_hf/qwen.tiktoken DELETED
The diff for this file is too large to render. See raw diff
 
cooldown/iter_0084772_hf/qwen_generation_utils.py DELETED
@@ -1,416 +0,0 @@
- # Copyright (c) Alibaba Cloud.
- #
- # This source code is licensed under the license found in the
- # LICENSE file in the root directory of this source tree.
-
- """Generation support."""
-
- from typing import Tuple, List, Union, Iterable
-
- import numpy as np
- import torch
- import torch.nn.functional as F
- from transformers import PreTrainedTokenizer
- from transformers import logging
- from transformers.generation import LogitsProcessor
-
- logger = logging.get_logger(__name__)
-
- # Types.
- HistoryType = List[Tuple[str, str]]
- TokensType = List[int]
- BatchTokensType = List[List[int]]
-
-
- def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType:
- for tokens in batch:
- context_length = len(tokens)
- if context_length < seq_length:
- tokens.extend([pad_id] * (seq_length - context_length))
- return batch
-
-
- def get_ltor_masks_and_position_ids(
- data,
- eod_token,
- reset_position_ids,
- reset_attention_mask,
- eod_mask_loss,
- ):
- """Build masks and position id for left to right model."""
-
- # Extract batch size and sequence length.
- micro_batch_size, seq_length = data.size()
-
- # Attention mask (lower triangular).
- if reset_attention_mask:
- att_mask_batch = micro_batch_size
- else:
- att_mask_batch = 1
- attention_mask = torch.tril(
- torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
- ).view(att_mask_batch, 1, seq_length, seq_length)
-
- # Loss mask.
- loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
- if eod_mask_loss:
- loss_mask[data == eod_token] = 0.0
-
- # Position ids.
- position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)
- position_ids = position_ids.unsqueeze(0).expand_as(data)
- # We need to clone as the ids will be modifed based on batch index.
- if reset_position_ids:
- position_ids = position_ids.clone()
-
- if reset_position_ids or reset_attention_mask:
- # Loop through the batches:
- for b in range(micro_batch_size):
-
- # Find indecies where EOD token is.
- eod_index = position_ids[b, data[b] == eod_token]
- # Detach indecies from positions if going to modify positions.
- if reset_position_ids:
- eod_index = eod_index.clone()
-
- # Loop through EOD indecies:
- prev_index = 0
- for j in range(eod_index.size()[0]):
- i = eod_index[j]
- # Mask attention loss.
- if reset_attention_mask:
- attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0
- # Reset positions.
- if reset_position_ids:
- position_ids[b, (i + 1) :] -= i + 1 - prev_index
- prev_index = i + 1
-
- # Convert attention mask to binary:
- attention_mask = attention_mask < 0.5
-
- return attention_mask, loss_mask, position_ids
-
-
- def get_batch(context_tokens: torch.LongTensor, eod_id: int):
- """Generate batch from context tokens."""
- # Move to GPU.
- tokens = context_tokens.contiguous().to(context_tokens.device)
- # Get the attention mask and postition ids.
- attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
- tokens,
- eod_id,
- reset_position_ids=False,
- reset_attention_mask=False,
- eod_mask_loss=False,
- )
- return tokens, attention_mask, position_ids
-
-
- def get_stop_words_ids(chat_format, tokenizer):
- if chat_format == "raw":
- stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]]
- elif chat_format == "chatml":
- stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
- else:
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
- return stop_words_ids
-
-
- def make_context(
- tokenizer: PreTrainedTokenizer,
- query: str,
- history: List[Tuple[str, str]] = None,
- system: str = "",
- max_window_size: int = 6144,
- chat_format: str = "chatml",
- ):
- if history is None:
- history = []
-
- if chat_format == "chatml":
- im_start, im_end = "<|im_start|>", "<|im_end|>"
- im_start_tokens = [tokenizer.im_start_id]
- im_end_tokens = [tokenizer.im_end_id]
- nl_tokens = tokenizer.encode("\n")
-
- def _tokenize_str(role, content):
- return f"{role}\n{content}", tokenizer.encode(
- role, allowed_special=set()
- ) + nl_tokens + tokenizer.encode(content, allowed_special=set())
-
- system_text, system_tokens_part = _tokenize_str("system", system)
- system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
-
- raw_text = ""
- context_tokens = []
-
- for turn_query, turn_response in reversed(history):
- query_text, query_tokens_part = _tokenize_str("user", turn_query)
- query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
- response_text, response_tokens_part = _tokenize_str(
- "assistant", turn_response
- )
- response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
-
- next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
- prev_chat = (
- f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
- )
-
- current_context_size = (
- len(system_tokens) + len(next_context_tokens) + len(context_tokens)
- )
- if current_context_size < max_window_size:
- context_tokens = next_context_tokens + context_tokens
- raw_text = prev_chat + raw_text
- else:
- break
-
- context_tokens = system_tokens + context_tokens
- raw_text = f"{im_start}{system_text}{im_end}" + raw_text
- context_tokens += (
- nl_tokens
- + im_start_tokens
- + _tokenize_str("user", query)[1]
- + im_end_tokens
- + nl_tokens
- + im_start_tokens
- + tokenizer.encode("assistant")
- + nl_tokens
- )
- raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
-
- elif chat_format == "raw":
- raw_text = query
- context_tokens = tokenizer.encode(raw_text)
- else:
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
-
- return raw_text, context_tokens
-
-
- def _decode_default(
- tokens: List[int],
- *,
- stop_words: List[str],
- eod_words: List[str],
- tokenizer: PreTrainedTokenizer,
- raw_text_len: int,
- verbose: bool = False,
- return_end_reason: bool = False,
- errors: str='replace',
- ):
- trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
- if verbose:
- print("\nRaw Generate: ", trim_decode_tokens)
-
- end_reason = f"Gen length {len(tokens)}"
- for stop_word in stop_words:
- trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
- for eod_word in eod_words:
- if eod_word in trim_decode_tokens:
- end_reason = f"Gen {eod_word!r}"
- trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
- trim_decode_tokens = trim_decode_tokens.strip()
- if verbose:
- print("\nEnd Reason:", end_reason)
- print("\nGenerate: ", trim_decode_tokens)
-
- if return_end_reason:
- return trim_decode_tokens, end_reason
- else:
- return trim_decode_tokens
-
-
- def _decode_chatml(
- tokens: List[int],
- *,
- stop_words: List[str],
- eod_token_ids: List[int],
- tokenizer: PreTrainedTokenizer,
- raw_text_len: int,
- context_length: int,
- verbose: bool = False,
- return_end_reason: bool = False,
- errors: str='replace'
- ):
- end_reason = f"Gen length {len(tokens)}"
- eod_token_idx = context_length
- for eod_token_idx in range(context_length, len(tokens)):
- if tokens[eod_token_idx] in eod_token_ids:
- end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
- break
-
- trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
- if verbose:
- print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
- print("\nRaw Generate:", trim_decode_tokens)
- print("\nEnd Reason:", end_reason)
- for stop_word in stop_words:
- trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip()
- trim_decode_tokens = trim_decode_tokens.strip()
- if verbose:
- print("\nGenerate:", trim_decode_tokens)
-
- if return_end_reason:
- return trim_decode_tokens, end_reason
- else:
- return trim_decode_tokens
-
-
- def decode_tokens(
- tokens: Union[torch.LongTensor, TokensType],
- tokenizer: PreTrainedTokenizer,
- raw_text_len: int,
- context_length: int,
- chat_format: str,
- verbose: bool = False,
- return_end_reason: bool = False,
- errors: str="replace",
- ) -> str:
- if torch.is_tensor(tokens):
- tokens = tokens.cpu().numpy().tolist()
-
- if chat_format == "chatml":
- return _decode_chatml(
- tokens,
- stop_words=[],
- eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
- tokenizer=tokenizer,
- raw_text_len=raw_text_len,
- context_length=context_length,
- verbose=verbose,
- return_end_reason=return_end_reason,
- errors=errors,
- )
- elif chat_format == "raw":
- return _decode_default(
- tokens,
- stop_words=["<|endoftext|>"],
- eod_words=["<|endoftext|>"],
- tokenizer=tokenizer,
- raw_text_len=raw_text_len,
- verbose=verbose,
- return_end_reason=return_end_reason,
- errors=errors,
- )
- else:
- raise NotImplementedError(f"Unknown chat format {chat_format!r}")
-
-
- class StopWordsLogitsProcessor(LogitsProcessor):
- """
- :class:`transformers.LogitsProcessor` that enforces that when specified sequences appear, stop geration.
-
- Args:
- stop_words_ids (:obj:`List[List[int]]`):
- List of list of token ids of stop ids. In order to get the tokens of the words
- that should not appear in the generated text, use :obj:`tokenizer(bad_word,
- add_prefix_space=True).input_ids`.
- eos_token_id (:obj:`int`):
- The id of the `end-of-sequence` token.
- """
-
- def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
-
- if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0:
- raise ValueError(
- f"`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}."
- )
- if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids):
- raise ValueError(
- f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
- )
- if any(
- any(
- (not isinstance(token_id, (int, np.integer)) or token_id < 0)
- for token_id in stop_word_ids
- )
- for stop_word_ids in stop_words_ids
- ):
- raise ValueError(
- f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
- )
-
- self.stop_words_ids = list(
- filter(
- lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids
- )
- )
- self.eos_token_id = eos_token_id
- for stop_token_seq in self.stop_words_ids:
- assert (
- len(stop_token_seq) > 0
- ), "Stop words token sequences {} cannot have an empty list".format(
- stop_words_ids
- )
-
- def __call__(
- self, input_ids: torch.LongTensor, scores: torch.FloatTensor
- ) -> torch.FloatTensor:
- stopped_samples = self._calc_stopped_samples(input_ids)
- for i, should_stop in enumerate(stopped_samples):
- if should_stop:
- scores[i, self.eos_token_id] = float(2**15)
- return scores
-
- def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
- if len(tokens) == 0:
- # if bad word tokens is just one token always ban it
- return True
- elif len(tokens) > len(prev_tokens):
- # if bad word tokens are longer then prev input_ids they can't be equal
- return False
- elif prev_tokens[-len(tokens) :].tolist() == tokens:
- # if tokens match
- return True
- else:
- return False
-
- def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
- stopped_samples = []
- for prev_input_ids_slice in prev_input_ids:
- match = False
- for stop_token_seq in self.stop_words_ids:
- if self._tokens_match(prev_input_ids_slice, stop_token_seq):
- # if tokens do not match continue
- match = True
- break
- stopped_samples.append(match)
-
- return stopped_samples
-
-
- def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
- """This function has been mostly taken from huggingface conversational
- ai code at
- https://medium.com/huggingface/how-to-build-a-state-of-the-art-
- conversational-ai-with-transfer-learning-2d818ac26313"""
-
- if top_k > 0:
- # Remove all tokens with a probability less than the
- # last token of the top-k
- indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
- logits[indices_to_remove] = filter_value
-
- if top_p > 0.0:
- # Cconvert to 1D
- sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
- cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
- # Remove tokens with cumulative probability above the threshold
- sorted_indices_to_remove = cumulative_probs > top_p
- # Shift the indices to the right to keep also the first token
- # above the threshold
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
- sorted_indices_to_remove[..., 0] = 0
- for i in range(sorted_indices.size(0)):
- indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
- logits[i][indices_to_remove] = filter_value
-
- return logits
-
-
- def switch(val1, val2, boolean):
- boolean = boolean.type_as(val1)
- return (1 - boolean) * val1 + boolean * val2
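
(Aside, not part of the commit: a hedged end-to-end sketch of how the helpers deleted above were typically wired together for ChatML generation. The checkpoint path is hypothetical since this commit removes it, and the import assumes qwen_generation_utils.py is on the Python path.)

# Sketch only: wiring make_context / StopWordsLogitsProcessor / decode_tokens together.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import LogitsProcessorList
from qwen_generation_utils import (
    StopWordsLogitsProcessor,
    decode_tokens,
    get_stop_words_ids,
    make_context,
)

path = "cooldown/iter_0084772_hf"  # illustrative; removed by this commit
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16)

raw_text, context_tokens = make_context(
    tokenizer, "Hello!", history=[], system="You are a helpful assistant", chat_format="chatml"
)
input_ids = torch.tensor([context_tokens])
processors = LogitsProcessorList(
    [StopWordsLogitsProcessor(get_stop_words_ids("chatml", tokenizer), tokenizer.eod_id)]
)
output = model.generate(input_ids, logits_processor=processors, max_new_tokens=64)
response = decode_tokens(
    output[0],
    tokenizer,
    raw_text_len=len(raw_text),
    context_length=len(context_tokens),
    chat_format="chatml",
)
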
cooldown/iter_0084772_hf/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
- {
- "bos_token": "<|extra_203|>",
- "eos_token": "<|extra_204|>",
- "unk_token": "<|endoftext|>",
- "pad_token": "<|endoftext|>"
- }
 
 
 
 
 
 
 
cooldown/iter_0084772_hf/tokenization_qwen.py DELETED
@@ -1,276 +0,0 @@
- # Copyright (c) Alibaba Cloud.
- #
- # This source code is licensed under the license found in the
- # LICENSE file in the root directory of this source tree.
-
- """Tokenization classes for QWen."""
-
- import base64
- import logging
- import os
- import unicodedata
- from typing import Collection, Dict, List, Set, Tuple, Union
-
- import tiktoken
- from transformers import PreTrainedTokenizer, AddedToken
-
- logger = logging.getLogger(__name__)
-
-
- VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
-
- PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
- ENDOFTEXT = "<|endoftext|>"
- IMSTART = "<|im_start|>"
- IMEND = "<|im_end|>"
- # as the default behavior is changed to allow special tokens in
- # regular texts, the surface forms of special tokens need to be
- # as different as possible to minimize the impact
- EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
- # changed to use actual index to avoid misconfiguration with vocabulary expansion
- SPECIAL_START_ID = 151643
- SPECIAL_TOKENS = tuple(
- enumerate(
- (
- (
- ENDOFTEXT,
- IMSTART,
- IMEND,
- )
- + EXTRAS
- ),
- start=SPECIAL_START_ID,
- )
- )
- SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
-
-
- def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
- with open(tiktoken_bpe_file, "rb") as f:
- contents = f.read()
- return {
- base64.b64decode(token): int(rank)
- for token, rank in (line.split() for line in contents.splitlines() if line)
- }
-
-
- class QWenTokenizer(PreTrainedTokenizer):
- """QWen tokenizer."""
-
- vocab_files_names = VOCAB_FILES_NAMES
-
- def __init__(
- self,
- vocab_file,
- errors="replace",
- extra_vocab_file=None,
- **kwargs,
- ):
- super().__init__(**kwargs)
-
- # how to handle errors in decoding UTF-8 byte sequences
- # use ignore if you are in streaming inference
- self.errors = errors
-
- self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
- self.special_tokens = {
- token: index
- for index, token in SPECIAL_TOKENS
- }
-
- # try load extra vocab from file
- if extra_vocab_file is not None:
- used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
- extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
- for token, index in extra_mergeable_ranks.items():
- if token in self.mergeable_ranks:
- logger.info(f"extra token {token} exists, skipping")
- continue
- if index in used_ids:
- logger.info(f'the index {index} for extra token {token} exists, skipping')
- continue
- self.mergeable_ranks[token] = index
- # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
-
- enc = tiktoken.Encoding(
- "Qwen",
- pat_str=PAT_STR,
- mergeable_ranks=self.mergeable_ranks,
- special_tokens=self.special_tokens,
- )
- assert (
- len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
- ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
-
- self.decoder = {
- v: k for k, v in self.mergeable_ranks.items()
- } # type: dict[int, bytes|str]
- self.decoder.update({v: k for k, v in self.special_tokens.items()})
-
- self.tokenizer = enc # type: tiktoken.Encoding
-
- self.eod_id = self.tokenizer.eot_token
- self.im_start_id = self.special_tokens[IMSTART]
- self.im_end_id = self.special_tokens[IMEND]
-
- def __getstate__(self):
- # for pickle lovers
- state = self.__dict__.copy()
- del state["tokenizer"]
- return state
-
- def __setstate__(self, state):
- # tokenizer is not python native; don't pass it; rebuild it
- self.__dict__.update(state)
- enc = tiktoken.Encoding(
- "Qwen",
- pat_str=PAT_STR,
- mergeable_ranks=self.mergeable_ranks,
- special_tokens=self.special_tokens,
- )
- self.tokenizer = enc
-
- def __len__(self) -> int:
- return self.tokenizer.n_vocab
-
- def get_vocab(self) -> Dict[bytes, int]:
- return self.mergeable_ranks
-
- def convert_tokens_to_ids(
- self, tokens: Union[bytes, str, List[Union[bytes, str]]]
- ) -> List[int]:
- ids = []
- if isinstance(tokens, (str, bytes)):
- if tokens in self.special_tokens:
- return self.special_tokens[tokens]
- else:
- return self.mergeable_ranks.get(tokens)
- for token in tokens:
- if token in self.special_tokens:
- ids.append(self.special_tokens[token])
- else:
- ids.append(self.mergeable_ranks.get(token))
- return ids
-
- def _add_tokens(
- self,
- new_tokens: Union[List[str], List[AddedToken]],
- special_tokens: bool = False,
- ) -> int:
- if not special_tokens and new_tokens:
- raise ValueError("Adding regular tokens is not supported")
- for token in new_tokens:
- surface_form = token.content if isinstance(token, AddedToken) else token
- if surface_form not in SPECIAL_TOKENS_SET:
- raise ValueError("Adding unknown special tokens is not supported")
- return 0
-
- def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
- """
- Save only the vocabulary of the tokenizer (vocabulary).
-
- Returns:
- `Tuple(str)`: Paths to the files saved.
- """
- file_path = os.path.join(save_directory, "qwen.tiktoken")
- with open(file_path, "w", encoding="utf8") as w:
- for k, v in self.mergeable_ranks.items():
- line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
- w.write(line)
- return (file_path,)
-
- def tokenize(
- self,
- text: str,
- allowed_special: Union[Set, str] = "all",
- disallowed_special: Union[Collection, str] = (),
- **kwargs,
- ) -> List[Union[bytes, str]]:
- """
- Converts a string in a sequence of tokens.
-
- Args:
- text (`str`):
- The sequence to be encoded.
- allowed_special (`Literal["all"]` or `set`):
- The surface forms of the tokens to be encoded as special tokens in regular texts.
- Default to "all".
- disallowed_special (`Literal["all"]` or `Collection`):
- The surface forms of the tokens that should not be in regular texts and trigger errors.
- Default to an empty tuple.
-
- kwargs (additional keyword arguments, *optional*):
- Will be passed to the underlying model specific encode method.
-
- Returns:
- `List[bytes|str]`: The list of tokens.
- """
- tokens = []
- text = unicodedata.normalize("NFC", text)
-
- # this implementation takes a detour: text -> token id -> token surface forms
- for t in self.tokenizer.encode(
- text, allowed_special=allowed_special, disallowed_special=disallowed_special
- ):
- tokens.append(self.decoder[t])
- return tokens
-
- def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
- """
- Converts a sequence of tokens in a single string.
- """
- text = ""
- temp = b""
- for t in tokens:
- if isinstance(t, str):
- if temp:
- text += temp.decode("utf-8", errors=self.errors)
- temp = b""
- text += t
- elif isinstance(t, bytes):
- temp += t
- else:
- raise TypeError("token should only be of type types or str")
- if temp:
- text += temp.decode("utf-8", errors=self.errors)
- return text
-
- @property
- def vocab_size(self):
- return self.tokenizer.n_vocab
-
- def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
- """Converts an id to a token, special tokens included"""
- if index in self.decoder:
- return self.decoder[index]
- raise ValueError("unknown ids")
-
- def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
- """Converts a token to an id using the vocab, special tokens included"""
- if token in self.special_tokens:
- return self.special_tokens[token]
- if token in self.mergeable_ranks:
- return self.mergeable_ranks[token]
- raise ValueError("unknown token")
-
- def _tokenize(self, text: str, **kwargs):
- """
- Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
- vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
-
- Do NOT take care of added tokens.
- """
- raise NotImplementedError
-
- def _decode(
- self,
- token_ids: Union[int, List[int]],
- skip_special_tokens: bool = False,
- errors: str = None,
- **kwargs,
- ) -> str:
- if isinstance(token_ids, int):
- token_ids = [token_ids]
- if skip_special_tokens:
- token_ids = [i for i in token_ids if i < self.eod_id]
- return self.tokenizer.decode(token_ids, errors=errors or self.errors)
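
(Aside, not part of the commit: the vocab file this tokenizer loads, qwen.tiktoken, is a plain-text list of base64-encoded token bytes and their ranks, which is exactly what _load_tiktoken_bpe above parses. A tiny self-contained sketch of that format:)

# Sketch only: the "<base64 token> <rank>" format parsed by _load_tiktoken_bpe above.
import base64

sample = b"\n".join(
    [
        base64.b64encode(b"hello") + b" 0",
        base64.b64encode(b" world") + b" 1",
    ]
)
ranks = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in sample.splitlines() if line)
}
assert ranks == {b"hello": 0, b" world": 1}
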
cooldown/iter_0084772_hf/tokenizer_config.json DELETED
@@ -1,11 +0,0 @@
- {
- "model_max_length": 8192,
- "tokenizer_class": "QWenTokenizer",
- "auto_map": {
- "AutoTokenizer": [
- "tokenization_qwen.QWenTokenizer",
- null
- ]
- },
- "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
- }