yujiepan committed
Commit 54ec24f · verified · 1 Parent(s): 4e3211f

Upload folder using huggingface_hub
README.md ADDED
---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
---

This model is for debugging. It is randomly initialized using the config from [deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3), but at a much smaller size.

Code:
```python
import json
import os
from pathlib import Path

import torch
import transformers
from huggingface_hub import create_repo, upload_folder
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          GenerationConfig, enable_full_determinism, pipeline,
                          set_seed)

model_id = "deepseek-ai/DeepSeek-V3"
repo_id = "yujiepan/deepseek-v3-tiny-random"
save_path = f"/tmp/{repo_id}"
os.system(f"rm -rf {save_path}")

# Shrink every dimension of the original config.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.num_hidden_layers = 2
config.first_k_dense_replace = 1
config.hidden_size = 16
config.intermediate_size = 32
config.moe_intermediate_size = 16
config.q_lora_rank = 16
config.kv_lora_rank = 16
config.qk_rope_head_dim = 16
config.qk_nope_head_dim = 16
config.v_head_dim = 16
config.num_attention_heads = 2
config.num_key_value_heads = 2
# transformers does not yet support the customized quantization config
del config.quantization_config

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.save_pretrained(save_path)

enable_full_determinism(seed=42)
model = AutoModelForCausalLM.from_config(
    config, torch_dtype=torch.bfloat16, trust_remote_code=True,
).eval()

try:
    model.generation_config = GenerationConfig.from_pretrained(
        model_id, trust_remote_code=True)
except Exception:
    print("No generation config found")

# Print parameter names/shapes and count the total parameters.
num_params = 0
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        if 'experts' in name and 'experts.0.' not in name:  # avoid printing too much
            pass
        else:
            print(name, p.shape)
        # torch.nn.init.uniform_(p, -0.2, 0.2)
        num_params += p.numel()
print(f"Number of parameters: {num_params / 1e6:.2f}M")
model.save_pretrained(save_path)

# Patch the saved config so that loading routes to the official modeling code.
auto_map = config.auto_map
with open(f"{save_path}/config.json", "r") as f:
    config = json.load(f)
config['auto_map'] = auto_map
with open(f"{save_path}/config.json", "w") as f:
    json.dump(config, f, indent=2)

os.system(f"cat {save_path}/config.json")

# Drop the local modeling files; auto_map points back to the upstream repo.
del model
del tokenizer
for p in Path(save_path).glob("*.py"):
    os.remove(p)

os.system(f"ls -alh {save_path}")

# Smoke test: reload from disk and run a short greedy generation.
torch.use_deterministic_algorithms(False)
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForCausalLM.from_pretrained(
    save_path, trust_remote_code=True).eval()
prompt = 'Hello!'
messages = [
    {"role": "system", "content": "You are a helpful assistant."}
]
messages.append({"role": "user", "content": prompt})
tokenized_chat = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")

device = torch.device("cuda")
outputs = model.to(device).generate(
    tokenized_chat.to(device),
    max_new_tokens=16,
    do_sample=False,
    use_cache=True,
)
tokens = tokenizer.convert_ids_to_tokens(outputs[0])
string = tokenizer.decode(outputs[0])
print(tokens)

# create_repo(repo_id, exist_ok=True)
# upload_folder(repo_id=repo_id, folder_path=save_path)
```
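For quick verification, a minimal usage sketch (not part of the upload script above): it assumes the repo has already been pushed to the Hub, and the generated text is random noise by design.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "yujiepan/deepseek-v3-tiny-random"
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
# trust_remote_code pulls the modeling files referenced by auto_map.
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(inputs, max_new_tokens=16, do_sample=False)
print(tokenizer.decode(outputs[0]))  # random tokens: weights are untrained
```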
config.json ADDED
{
  "_name_or_path": "deepseek-ai/DeepSeek-V3",
  "architectures": [
    "DeepseekV3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "deepseek-ai/DeepSeek-V3--configuration_deepseek.DeepseekV3Config",
    "AutoModel": "deepseek-ai/DeepSeek-V3--modeling_deepseek.DeepseekV3Model",
    "AutoModelForCausalLM": "deepseek-ai/DeepSeek-V3--modeling_deepseek.DeepseekV3ForCausalLM"
  },
  "aux_loss_alpha": 0.001,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "ep_size": 1,
  "first_k_dense_replace": 1,
  "hidden_act": "silu",
  "hidden_size": 16,
  "initializer_range": 0.02,
  "intermediate_size": 32,
  "kv_lora_rank": 16,
  "max_position_embeddings": 163840,
  "model_type": "deepseek_v3",
  "moe_intermediate_size": 16,
  "moe_layer_freq": 1,
  "n_group": 8,
  "n_routed_experts": 256,
  "n_shared_experts": 1,
  "norm_topk_prob": true,
  "num_attention_heads": 2,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 2,
  "num_key_value_heads": 2,
  "num_nextn_predict_layers": 1,
  "pretraining_tp": 1,
  "q_lora_rank": 16,
  "qk_nope_head_dim": 16,
  "qk_rope_head_dim": 16,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn"
  },
  "rope_theta": 10000,
  "routed_scaling_factor": 2.5,
  "scoring_func": "sigmoid",
  "seq_aux": true,
  "tie_word_embeddings": false,
  "topk_group": 4,
  "topk_method": "noaux_tc",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "v_head_dim": 16,
  "vocab_size": 129280
}
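Note that the `auto_map` entries route `trust_remote_code` loading to the upstream DeepSeek-V3 modeling files, while the layer dimensions are shrunk and the MoE routing fields (256 routed experts, 8 per token) are kept intact. A small sanity-check sketch, assuming the repo is on the Hub:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "yujiepan/deepseek-v3-tiny-random", trust_remote_code=True)
assert config.hidden_size == 16 and config.num_hidden_layers == 2
# The routing path is still exercised despite the tiny hidden size.
print(config.n_routed_experts, config.num_experts_per_tok)  # 256 8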
generation_config.json ADDED
{
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "transformers_version": "4.38.2"
}
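The generation defaults simply mirror the special token ids from the config; a sketch of reading them back, assuming the repo is on the Hub:

```python
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("yujiepan/deepseek-v3-tiny-random")
print(gen.bos_token_id, gen.eos_token_id)  # 0 1
```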
model.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:13cc050b5806e2718d01551302a49cabed2e156c822700679f44f6b5fad50fef
size 8785464
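This is a Git LFS pointer; the ~8.8 MB weights live in LFS storage. A sketch of inspecting the tensors without instantiating the model, assuming the `safetensors` and `huggingface_hub` packages are installed:

```python
from huggingface_hub import hf_hub_download
from safetensors import safe_open

# Download just the weight file, then list a few tensor names and shapes.
path = hf_hub_download("yujiepan/deepseek-v3-tiny-random", "model.safetensors")
with safe_open(path, framework="pt") as f:
    for name in sorted(f.keys())[:5]:
        print(name, f.get_tensor(name).shape)
```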
special_tokens_map.json ADDED
{
  "bos_token": {
    "content": "<|begin▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
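As in the upstream DeepSeek-V3 repo, the pad token is mapped to the eos token. A sketch of confirming this through the loaded tokenizer, assuming the repo is on the Hub:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "yujiepan/deepseek-v3-tiny-random", trust_remote_code=True)
print(tok.bos_token)                  # <|begin▁of▁sentence|>
print(tok.pad_token == tok.eos_token) # True
```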
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
The diff for this file is too large to render.