Upload folder using huggingface_hub
Browse files
- config.json +77 -0
- draft_model/config.json +38 -0
- draft_model/states.pt +3 -0
- modeling_speculative_qwen3.py +45 -0
- specforge_het.json +120 -0
config.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"Qwen3ForCausalLM"
|
4 |
+
],
|
5 |
+
"attention_bias": false,
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"auto_map": {
|
8 |
+
"AutoModelForCausalLM": "modeling_speculative_qwen3.SpeculativeQwen3ForCausalLM"
|
9 |
+
},
|
10 |
+
"bos_token_id": 151643,
|
11 |
+
"draft_layers": 1,
|
12 |
+
"eos_token_id": 151645,
|
13 |
+
"head_dim": 128,
|
14 |
+
"hidden_act": "silu",
|
15 |
+
"hidden_size": 2560,
|
16 |
+
"initializer_range": 0.02,
|
17 |
+
"intermediate_size": 9728,
|
18 |
+
"layer_types": [
|
19 |
+
"full_attention",
|
20 |
+
"full_attention",
|
21 |
+
"full_attention",
|
22 |
+
"full_attention",
|
23 |
+
"full_attention",
|
24 |
+
"full_attention",
|
25 |
+
"full_attention",
|
26 |
+
"full_attention",
|
27 |
+
"full_attention",
|
28 |
+
"full_attention",
|
29 |
+
"full_attention",
|
30 |
+
"full_attention",
|
31 |
+
"full_attention",
|
32 |
+
"full_attention",
|
33 |
+
"full_attention",
|
34 |
+
"full_attention",
|
35 |
+
"full_attention",
|
36 |
+
"full_attention",
|
37 |
+
"full_attention",
|
38 |
+
"full_attention",
|
39 |
+
"full_attention",
|
40 |
+
"full_attention",
|
41 |
+
"full_attention",
|
42 |
+
"full_attention",
|
43 |
+
"full_attention",
|
44 |
+
"full_attention",
|
45 |
+
"full_attention",
|
46 |
+
"full_attention",
|
47 |
+
"full_attention",
|
48 |
+
"full_attention",
|
49 |
+
"full_attention",
|
50 |
+
"full_attention",
|
51 |
+
"full_attention",
|
52 |
+
"full_attention",
|
53 |
+
"full_attention",
|
54 |
+
"full_attention"
|
55 |
+
],
|
56 |
+
"max_position_embeddings": 262144,
|
57 |
+
"max_window_layers": 36,
|
58 |
+
"model_type": "qwen3",
|
59 |
+
"num_attention_heads": 32,
|
60 |
+
"num_hidden_layers": 0,
|
61 |
+
"num_key_value_heads": 8,
|
62 |
+
"rms_norm_eps": 1e-06,
|
63 |
+
"rope_scaling": null,
|
64 |
+
"rope_theta": 5000000,
|
65 |
+
"skip_input_layernorm": true,
|
66 |
+
"skip_output_norm": true,
|
67 |
+
"sliding_window": null,
|
68 |
+
"speculative_decoding_algorithm": "EagleV2",
|
69 |
+
"speculative_decoding_base_model_path": "Qwen/Qwen3-4B-Instruct-2507",
|
70 |
+
"speculative_decoding_draft_model": "Qwen3MoeDrafter",
|
71 |
+
"tie_word_embeddings": true,
|
72 |
+
"torch_dtype": "float32",
|
73 |
+
"transformers_version": "4.54.1",
|
74 |
+
"use_cache": true,
|
75 |
+
"use_sliding_window": false,
|
76 |
+
"vocab_size": 151936
|
77 |
+
}
|
draft_model/config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"Qwen3MoeForCausalLM"
|
4 |
+
],
|
5 |
+
"attention_bias": false,
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 151643,
|
8 |
+
"decoder_sparse_step": 1,
|
9 |
+
"eos_token_id": 151645,
|
10 |
+
"head_dim": 128,
|
11 |
+
"hidden_act": "silu",
|
12 |
+
"hidden_size": 2560,
|
13 |
+
"initializer_range": 0.02,
|
14 |
+
"intermediate_size": 6144,
|
15 |
+
"max_position_embeddings": 262144,
|
16 |
+
"max_window_layers": 48,
|
17 |
+
"mlp_only_layers": [],
|
18 |
+
"model_type": "qwen3_moe",
|
19 |
+
"moe_intermediate_size": 1216,
|
20 |
+
"norm_topk_prob": true,
|
21 |
+
"num_attention_heads": 32,
|
22 |
+
"num_experts": 128,
|
23 |
+
"num_experts_per_tok": 8,
|
24 |
+
"num_hidden_layers": 1,
|
25 |
+
"num_key_value_heads": 8,
|
26 |
+
"output_router_logits": false,
|
27 |
+
"rms_norm_eps": 1e-06,
|
28 |
+
"rope_scaling": null,
|
29 |
+
"rope_theta": 5000000,
|
30 |
+
"router_aux_loss_coef": 0.001,
|
31 |
+
"sliding_window": null,
|
32 |
+
"tie_word_embeddings": false,
|
33 |
+
"torch_dtype": "bfloat16",
|
34 |
+
"transformers_version": "4.54.1",
|
35 |
+
"use_cache": true,
|
36 |
+
"use_sliding_window": false,
|
37 |
+
"vocab_size": 151936
|
38 |
+
}
|
draft_model/states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae9e2e4bc8b39094938df8bebfa7afd0b39f93ddad29843846ddf6863d51b3a0
|
3 |
+
size 4940286931
|
modeling_speculative_qwen3.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers.models.qwen3.modeling_qwen3 import *
|
3 |
+
from specforge_het.specforge_lm import SpecForgeLM
|
4 |
+
|
5 |
+
|
6 |
+
class Qwen3Drafter(Qwen3Model):
    """Qwen3 decoder stack configured from, and paired with, a base model.

    The constructor overwrites the layer count and hidden size of the given
    draft config with values taken from the base model, builds the regular
    ``Qwen3Model``, and then strips or replaces a few sub-modules.
    """

    def __init__(self, draft_config, base_model):
        """Build the drafter.

        Args:
            draft_config: Config object for the draft stack. It is modified
                in place: ``num_hidden_layers`` and ``hidden_size`` are
                overwritten before the parent constructor runs.
            base_model: Object exposing ``config`` (with ``draft_layers``,
                ``skip_input_layernorm`` and ``skip_output_norm``) and a
                ``get_hidden_size()`` method.
        """
        base_config = base_model.config

        # Size the draft stack from the base model before the parent
        # constructor instantiates any layers.
        draft_config.num_hidden_layers = base_config.draft_layers
        draft_config.hidden_size = base_model.get_hidden_size()
        super().__init__(draft_config)

        if base_config.skip_input_layernorm:
            # Swap every layer's input norm for a pass-through module.
            for decoder_layer in self.layers:
                del decoder_layer.input_layernorm
                decoder_layer.input_layernorm = torch.nn.Identity()

        if base_config.skip_output_norm:
            # Same treatment for the final norm of the stack.
            del self.norm
            self.norm = torch.nn.Identity()

        # The drafter keeps no embedding table of its own.
        # NOTE(review): presumably embeddings come from the base model --
        # confirm against the caller.
        del self.embed_tokens

    def get_hidden_size(self):
        """Return ``hidden_size`` from this model's config."""
        return self.config.hidden_size
|
25 |
+
|
26 |
+
|
27 |
+
class SpeculativeQwen3ForCausalLM(SpecForgeLM, Qwen3ForCausalLM):
    """Qwen3 causal LM combined with the ``SpecForgeLM`` mixin.

    The methods below are thin accessors over the wrapped Qwen3 model.
    NOTE(review): they look like the hooks ``SpecForgeLM`` expects from a
    concrete model -- confirm against ``specforge_het.specforge_lm``.
    """

    @property
    def base_model(self):
        """The inner model stored on ``self.model``."""
        return self.model

    def get_hidden_size(self):
        """Return ``hidden_size`` from this model's config."""
        return self.config.hidden_size

    def get_base_layers(self):
        """Return the ``layers`` attribute of the base model."""
        backbone = self.base_model
        return backbone.layers

    def get_token_embedding(self, input_ids):
        """Return the base model's ``embed_tokens`` output for ``input_ids``."""
        backbone = self.base_model
        return backbone.embed_tokens(input_ids)

    def get_token_logits(self, hidden_states):
        """Return the result of applying ``lm_head`` to ``hidden_states``."""
        return self.lm_head(hidden_states)

    def save_pretrained(self, path, **kwargs):
        """Delegate saving to ``save_speculative_model``.

        All arguments are forwarded unchanged and the delegate's return
        value is passed back to the caller.
        """
        return self.save_speculative_model(path, **kwargs)
|
specforge_het.json
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset.debug": false,
|
3 |
+
"dataset.eval_path": null,
|
4 |
+
"dataset.git_diff": "",
|
5 |
+
"dataset.git_sha1": "unknown",
|
6 |
+
"dataset.manual_sample_ids": [],
|
7 |
+
"dataset.max_read_items": null,
|
8 |
+
"dataset.output_dir": "output",
|
9 |
+
"dataset.path": "output/datasets/ds_Qwen3-4B-Instruct-2507",
|
10 |
+
"dataset.read_eagle_format": false,
|
11 |
+
"dataset.run_name": "temp_run",
|
12 |
+
"dataset.seed": 0,
|
13 |
+
"dataset_generation.batch_size": 1,
|
14 |
+
"dataset_generation.debug": false,
|
15 |
+
"dataset_generation.debug_target": null,
|
16 |
+
"dataset_generation.ds_prefix": "ds_",
|
17 |
+
"dataset_generation.git_diff": "",
|
18 |
+
"dataset_generation.git_sha1": "unknown",
|
19 |
+
"dataset_generation.max_length": 2048,
|
20 |
+
"dataset_generation.output_dir": "output",
|
21 |
+
"dataset_generation.run_name": "temp_run",
|
22 |
+
"dataset_generation.save_every": 1000,
|
23 |
+
"dataset_generation.seed": 0,
|
24 |
+
"dataset_generation.sharegpt_path": "Aeala/ShareGPT_Vicuna_unfiltered",
|
25 |
+
"device_names": [
|
26 |
+
"NVIDIA H100 80GB HBM3"
|
27 |
+
],
|
28 |
+
"inference.alpha_stats": false,
|
29 |
+
"inference.debug": false,
|
30 |
+
"inference.draft_growing": false,
|
31 |
+
"inference.draft_tree_shape": "mc_sim_7b_64",
|
32 |
+
"inference.dynamic_draft": true,
|
33 |
+
"inference.dynamic_draft_all_top_k": 59,
|
34 |
+
"inference.dynamic_draft_max_depth": 5,
|
35 |
+
"inference.dynamic_draft_top_k": 10,
|
36 |
+
"inference.git_diff": "",
|
37 |
+
"inference.git_sha1": "unknown",
|
38 |
+
"inference.interactive": false,
|
39 |
+
"inference.max_draft_growing_depth": 100,
|
40 |
+
"inference.max_new_tokens": 512,
|
41 |
+
"inference.mode": "speculative",
|
42 |
+
"inference.output_dir": "output",
|
43 |
+
"inference.run_name": "temp_run",
|
44 |
+
"inference.seed": 0,
|
45 |
+
"inference.timer": false,
|
46 |
+
"modeling.chat_template": "models/speculative_qwen3/chat_template.jinja2",
|
47 |
+
"modeling.debug": false,
|
48 |
+
"modeling.draft_config_modify": {
|
49 |
+
"moe_intermediate_size": "base.intermediate_size // draft_config.num_experts_per_tok",
|
50 |
+
"num_key_value_heads": "base.num_key_value_heads",
|
51 |
+
"rope_theta": "base.rope_theta"
|
52 |
+
},
|
53 |
+
"modeling.dtype": "torch.float32",
|
54 |
+
"modeling.free_base_layers": "num_hidden_layers",
|
55 |
+
"modeling.git_diff": "",
|
56 |
+
"modeling.git_sha1": "unknown",
|
57 |
+
"modeling.init_base_model": [
|
58 |
+
"SpeculativeQwen3ForCausalLM",
|
59 |
+
"Qwen/Qwen3-4B-Instruct-2507"
|
60 |
+
],
|
61 |
+
"modeling.init_draft_config": [
|
62 |
+
"Qwen3MoeDrafter",
|
63 |
+
"Qwen/Qwen3-30B-A3B-Instruct-2507"
|
64 |
+
],
|
65 |
+
"modeling.init_speculative_algorithm": [
|
66 |
+
"EagleV2",
|
67 |
+
"dict(draft_layers=1)"
|
68 |
+
],
|
69 |
+
"modeling.model_path": null,
|
70 |
+
"modeling.output_dir": "output",
|
71 |
+
"modeling.run_name": "temp_run",
|
72 |
+
"modeling.seed": 0,
|
73 |
+
"modeling.tokenizer_add_tokens": "dict(unk_token=tokenizer.eos_token)",
|
74 |
+
"modeling.tokenizer_init": "",
|
75 |
+
"modeling.tokenizer_path": "Qwen/Qwen3-4B-Instruct-2507",
|
76 |
+
"training.adam_beta1": 0.9,
|
77 |
+
"training.adam_beta2": 0.95,
|
78 |
+
"training.average_tokens_across_devices": false,
|
79 |
+
"training.bf16": true,
|
80 |
+
"training.dataloader_drop_last": true,
|
81 |
+
"training.ddp_backend": "gloo",
|
82 |
+
"training.ddp_find_unused_parameters": false,
|
83 |
+
"training.debug": false,
|
84 |
+
"training.deepspeed": null,
|
85 |
+
"training.eval_steps": 100,
|
86 |
+
"training.eval_strategy": "no",
|
87 |
+
"training.filter_out_shorts": false,
|
88 |
+
"training.git_diff": "",
|
89 |
+
"training.git_sha1": "c545fcb3d9aa995dcf3e0ba5609559a39897f05e",
|
90 |
+
"training.gradient_accumulation_steps": 3,
|
91 |
+
"training.learning_rate": 3e-05,
|
92 |
+
"training.logging_first_step": true,
|
93 |
+
"training.logging_steps": 25,
|
94 |
+
"training.lr_scheduler_type": "constant_with_warmup",
|
95 |
+
"training.max_grad_norm": 0.5,
|
96 |
+
"training.max_length": 2048,
|
97 |
+
"training.max_steps": -1,
|
98 |
+
"training.model_init_ckpt": null,
|
99 |
+
"training.num_train_epochs": 10,
|
100 |
+
"training.optim": "adamw_torch_fused",
|
101 |
+
"training.output_dir": "output/eager-sky-65",
|
102 |
+
"training.overwrite_output_dir": true,
|
103 |
+
"training.per_device_eval_batch_size": 1,
|
104 |
+
"training.per_device_train_batch_size": 2,
|
105 |
+
"training.project": "eagle4",
|
106 |
+
"training.report_to": "wandb",
|
107 |
+
"training.resume_from_checkpoint": false,
|
108 |
+
"training.resume_wandb_runid": null,
|
109 |
+
"training.run_name": "eager-sky-65",
|
110 |
+
"training.save_steps": 500,
|
111 |
+
"training.save_strategy": "steps",
|
112 |
+
"training.save_total_limit": 2,
|
113 |
+
"training.seed": 0,
|
114 |
+
"training.sequential_loading": false,
|
115 |
+
"training.tf32": false,
|
116 |
+
"training.use_default_num_items_getter": true,
|
117 |
+
"training.use_eagle_pipeline": false,
|
118 |
+
"training.warmup_steps": 2000,
|
119 |
+
"training.world_size": 1
|
120 |
+
}
|