w32zhong committed
Commit 67f03fc (verified) · 1 Parent(s): d985c94

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,77 @@
+ {
+   "architectures": [
+     "Qwen3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoModelForCausalLM": "modeling_speculative_qwen3.SpeculativeQwen3ForCausalLM"
+   },
+   "bos_token_id": 151643,
+   "draft_layers": 1,
+   "eos_token_id": 151645,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 2560,
+   "initializer_range": 0.02,
+   "intermediate_size": 9728,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 262144,
+   "max_window_layers": 36,
+   "model_type": "qwen3",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 0,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 5000000,
+   "skip_input_layernorm": true,
+   "skip_output_norm": true,
+   "sliding_window": null,
+   "speculative_decoding_algorithm": "EagleV2",
+   "speculative_decoding_base_model_path": "Qwen/Qwen3-4B-Instruct-2507",
+   "speculative_decoding_draft_model": "Qwen3MoeDrafter",
+   "tie_word_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.54.1",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
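
config.json maps AutoModelForCausalLM to the custom SpeculativeQwen3ForCausalLM class added in this commit, so loading it requires trust_remote_code=True and an importable specforge_het package. A minimal loading sketch, assuming a local checkout of this upload at ./speculative_qwen3 (a placeholder; substitute the actual repo id or path):

    # Minimal loading sketch; the checkout path is a placeholder and
    # the specforge_het package must be installed for the custom class to import.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo = "./speculative_qwen3"  # placeholder for this upload

    # auto_map in config.json resolves AutoModelForCausalLM to
    # modeling_speculative_qwen3.SpeculativeQwen3ForCausalLM.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
    model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)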
draft_model/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "architectures": [
+     "Qwen3MoeForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "decoder_sparse_step": 1,
+   "eos_token_id": 151645,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 2560,
+   "initializer_range": 0.02,
+   "intermediate_size": 6144,
+   "max_position_embeddings": 262144,
+   "max_window_layers": 48,
+   "mlp_only_layers": [],
+   "model_type": "qwen3_moe",
+   "moe_intermediate_size": 1216,
+   "norm_topk_prob": true,
+   "num_attention_heads": 32,
+   "num_experts": 128,
+   "num_experts_per_tok": 8,
+   "num_hidden_layers": 1,
+   "num_key_value_heads": 8,
+   "output_router_logits": false,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 5000000,
+   "router_aux_loss_coef": 0.001,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.54.1",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
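
The drafter config mirrors the base model where it must (hidden_size 2560, num_key_value_heads 8, rope_theta 5000000) and derives its MoE expert width from the base MLP width via the modeling.draft_config_modify rules recorded in specforge_het.json below. A quick arithmetic check, with values copied from the two configs:

    # Values copied from config.json (base) and draft_model/config.json (drafter).
    base_intermediate_size = 9728
    num_experts_per_tok = 8

    # draft_config_modify rule: "base.intermediate_size // draft_config.num_experts_per_tok"
    moe_intermediate_size = base_intermediate_size // num_experts_per_tok
    assert moe_intermediate_size == 1216  # matches draft_model/config.json

    # The drafter keeps a single decoder layer, matching "draft_layers": 1 in config.json.
    draft_num_hidden_layers = 1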
draft_model/states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae9e2e4bc8b39094938df8bebfa7afd0b39f93ddad29843846ddf6863d51b3a0
+ size 4940286931
modeling_speculative_qwen3.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ from transformers.models.qwen3.modeling_qwen3 import *
+ from specforge_het.specforge_lm import SpecForgeLM
+
+
+ class Qwen3Drafter(Qwen3Model):
+     def __init__(self, draft_config, base_model):
+         draft_config.num_hidden_layers = base_model.config.draft_layers
+         draft_config.hidden_size = base_model.get_hidden_size()
+         super().__init__(draft_config)
+
+         if base_model.config.skip_input_layernorm:
+             for layer in self.layers:
+                 delattr(layer, 'input_layernorm')
+                 layer.input_layernorm = torch.nn.Identity()
+
+         if base_model.config.skip_output_norm:
+             delattr(self, 'norm')
+             self.norm = torch.nn.Identity()
+
+         delattr(self, 'embed_tokens')
+
+     def get_hidden_size(self):
+         return self.config.hidden_size
+
+
+ class SpeculativeQwen3ForCausalLM(SpecForgeLM, Qwen3ForCausalLM):
+     @property
+     def base_model(self):
+         return self.model
+
+     def get_hidden_size(self):
+         return self.config.hidden_size
+
+     def get_base_layers(self):
+         return self.base_model.layers
+
+     def get_token_embedding(self, input_ids):
+         return self.base_model.embed_tokens(input_ids)
+
+     def get_token_logits(self, hidden_states):
+         return self.lm_head(hidden_states)
+
+     def save_pretrained(self, path, **kwargs):
+         return self.save_speculative_model(path, **kwargs)
specforge_het.json ADDED
@@ -0,0 +1,120 @@
+ {
+   "dataset.debug": false,
+   "dataset.eval_path": null,
+   "dataset.git_diff": "",
+   "dataset.git_sha1": "unknown",
+   "dataset.manual_sample_ids": [],
+   "dataset.max_read_items": null,
+   "dataset.output_dir": "output",
+   "dataset.path": "output/datasets/ds_Qwen3-4B-Instruct-2507",
+   "dataset.read_eagle_format": false,
+   "dataset.run_name": "temp_run",
+   "dataset.seed": 0,
+   "dataset_generation.batch_size": 1,
+   "dataset_generation.debug": false,
+   "dataset_generation.debug_target": null,
+   "dataset_generation.ds_prefix": "ds_",
+   "dataset_generation.git_diff": "",
+   "dataset_generation.git_sha1": "unknown",
+   "dataset_generation.max_length": 2048,
+   "dataset_generation.output_dir": "output",
+   "dataset_generation.run_name": "temp_run",
+   "dataset_generation.save_every": 1000,
+   "dataset_generation.seed": 0,
+   "dataset_generation.sharegpt_path": "Aeala/ShareGPT_Vicuna_unfiltered",
+   "device_names": [
+     "NVIDIA H100 80GB HBM3"
+   ],
+   "inference.alpha_stats": false,
+   "inference.debug": false,
+   "inference.draft_growing": false,
+   "inference.draft_tree_shape": "mc_sim_7b_64",
+   "inference.dynamic_draft": true,
+   "inference.dynamic_draft_all_top_k": 59,
+   "inference.dynamic_draft_max_depth": 5,
+   "inference.dynamic_draft_top_k": 10,
+   "inference.git_diff": "",
+   "inference.git_sha1": "unknown",
+   "inference.interactive": false,
+   "inference.max_draft_growing_depth": 100,
+   "inference.max_new_tokens": 512,
+   "inference.mode": "speculative",
+   "inference.output_dir": "output",
+   "inference.run_name": "temp_run",
+   "inference.seed": 0,
+   "inference.timer": false,
+   "modeling.chat_template": "models/speculative_qwen3/chat_template.jinja2",
+   "modeling.debug": false,
+   "modeling.draft_config_modify": {
+     "moe_intermediate_size": "base.intermediate_size // draft_config.num_experts_per_tok",
+     "num_key_value_heads": "base.num_key_value_heads",
+     "rope_theta": "base.rope_theta"
+   },
+   "modeling.dtype": "torch.float32",
+   "modeling.free_base_layers": "num_hidden_layers",
+   "modeling.git_diff": "",
+   "modeling.git_sha1": "unknown",
+   "modeling.init_base_model": [
+     "SpeculativeQwen3ForCausalLM",
+     "Qwen/Qwen3-4B-Instruct-2507"
+   ],
+   "modeling.init_draft_config": [
+     "Qwen3MoeDrafter",
+     "Qwen/Qwen3-30B-A3B-Instruct-2507"
+   ],
+   "modeling.init_speculative_algorithm": [
+     "EagleV2",
+     "dict(draft_layers=1)"
+   ],
+   "modeling.model_path": null,
+   "modeling.output_dir": "output",
+   "modeling.run_name": "temp_run",
+   "modeling.seed": 0,
+   "modeling.tokenizer_add_tokens": "dict(unk_token=tokenizer.eos_token)",
+   "modeling.tokenizer_init": "",
+   "modeling.tokenizer_path": "Qwen/Qwen3-4B-Instruct-2507",
+   "training.adam_beta1": 0.9,
+   "training.adam_beta2": 0.95,
+   "training.average_tokens_across_devices": false,
+   "training.bf16": true,
+   "training.dataloader_drop_last": true,
+   "training.ddp_backend": "gloo",
+   "training.ddp_find_unused_parameters": false,
+   "training.debug": false,
+   "training.deepspeed": null,
+   "training.eval_steps": 100,
+   "training.eval_strategy": "no",
+   "training.filter_out_shorts": false,
+   "training.git_diff": "",
+   "training.git_sha1": "c545fcb3d9aa995dcf3e0ba5609559a39897f05e",
+   "training.gradient_accumulation_steps": 3,
+   "training.learning_rate": 3e-05,
+   "training.logging_first_step": true,
+   "training.logging_steps": 25,
+   "training.lr_scheduler_type": "constant_with_warmup",
+   "training.max_grad_norm": 0.5,
+   "training.max_length": 2048,
+   "training.max_steps": -1,
+   "training.model_init_ckpt": null,
+   "training.num_train_epochs": 10,
+   "training.optim": "adamw_torch_fused",
+   "training.output_dir": "output/eager-sky-65",
+   "training.overwrite_output_dir": true,
+   "training.per_device_eval_batch_size": 1,
+   "training.per_device_train_batch_size": 2,
+   "training.project": "eagle4",
+   "training.report_to": "wandb",
+   "training.resume_from_checkpoint": false,
+   "training.resume_wandb_runid": null,
+   "training.run_name": "eager-sky-65",
+   "training.save_steps": 500,
+   "training.save_strategy": "steps",
+   "training.save_total_limit": 2,
+   "training.seed": 0,
+   "training.sequential_loading": false,
+   "training.tf32": false,
+   "training.use_default_num_items_getter": true,
+   "training.use_eagle_pipeline": false,
+   "training.warmup_steps": 2000,
+   "training.world_size": 1
+ }
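
For reference, the effective training batch size implied by these settings is per_device_train_batch_size × gradient_accumulation_steps × world_size. A quick check in plain Python:

    # Effective batch size implied by the training.* settings above.
    per_device_train_batch_size = 2
    gradient_accumulation_steps = 3
    world_size = 1

    effective_batch = per_device_train_batch_size * gradient_accumulation_steps * world_size
    print(effective_batch)  # 6 sequences of up to 2048 tokens (training.max_length) per step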