---
# Mixture-of-experts merge configuration.
# NOTE(review): key names match the mergekit-moe schema; confirm against the
# tool that actually consumes this file.

# Base checkpoint for the merge and the tokenizer to ship with the result.
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
tokenizer_source: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

# Router initialisation strategy and output weight precision.
gate_mode: random
dtype: float16

# Number of routed experts selected per token.
experts_per_token: 2

# Routed experts.
# NOTE(review): the two entries are different model sizes (7B and 1.5B);
# verify that the merge tool accepts experts with mismatched dimensions.
experts:
  - source_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
  - source_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B

# Shared expert. `positive_prompts` and `residual_scale` are per-expert
# settings, so they are nested under the entry they configure.
# NOTE(review): nesting reconstructed from the flattened original; confirm.
shared_experts:
  - source_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
    # Single empty prompt kept as in the original.
    # NOTE(review): presumably unused while gate_mode is `random`; confirm.
    positive_prompts:
      - ""
    # Scale factor applied to this expert's output.
    residual_scale: 0.1