Update README.md

README.md

# DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic

## Model Overview
- **Model Architecture:** LlamaForCausalLM
- **Input:** Text
- **Output:** Text
- **Model Optimizations:**
  - **Weight quantization:** FP8
  - **Activation quantization:** FP8

Quantized version of [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B).

### Model Optimizations

This model was obtained by quantizing the weights and activations of [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) to the FP8 data type.
This optimization reduces the number of bits per parameter from 16 to 8, reducing the disk size and GPU memory requirements by approximately 50%.

Only the weights and activations of the linear operators within transformer blocks are quantized.
Weights are quantized using a symmetric per-channel scheme, whereas activations are quantized using a symmetric dynamic per-token scheme.
[LLM Compressor](https://github.com/vllm-project/llm-compressor) is used for quantization.
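
For an 8B-parameter model this is roughly 16 GB of weights in BF16 versus roughly 8 GB in FP8. The snippet below is a minimal numpy sketch of what these two schemes compute, not the LLM Compressor implementation (shapes and data are illustrative): weights receive one scale per output channel, fixed at compression time, while activations receive one scale per token, recomputed on every forward pass, both targeting the FP8 E4M3 range of ±448.

```python
import numpy as np

FP8_E4M3_MAX = 448.0  # largest finite magnitude representable in FP8 E4M3

def symmetric_scales(x: np.ndarray, axis: int) -> np.ndarray:
    """One symmetric scale per slice along `axis`: absmax / FP8 max."""
    return np.max(np.abs(x), axis=axis, keepdims=True) / FP8_E4M3_MAX

# Weights of a Linear layer: per-channel scales (one per output row),
# computed once when the model is compressed.
W = np.random.randn(4096, 4096).astype(np.float32)
w_scale = symmetric_scales(W, axis=1)                       # shape (4096, 1)
W_fp8 = np.clip(W / w_scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)   # real kernels cast to float8

# Activations: per-token scales (one per row), recomputed dynamically
# for every forward pass.
X = np.random.randn(8, 4096).astype(np.float32)             # 8 tokens
x_scale = symmetric_scales(X, axis=1)                       # shape (8, 1)
X_fp8 = np.clip(X / x_scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)

# The matmul runs in FP8 and the scales multiply back at the output:
Y = (X_fp8 @ W_fp8.T) * (x_scale * w_scale.T)               # approximates X @ W.T
```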

## Use with vLLM

This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.

```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

number_gpus = 1
model_name = "neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic"

tokenizer = AutoTokenizer.from_pretrained(model_name)
sampling_params = SamplingParams(temperature=0.6, max_tokens=256, stop_token_ids=[tokenizer.eos_token_id])
llm = LLM(model=model_name, tensor_parallel_size=number_gpus, trust_remote_code=True)

messages_list = [
    [{"role": "user", "content": "Who are you? Please respond in pirate speak!"}],
]

# Apply the chat template to each conversation, then generate completions
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True) for messages in messages_list]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

generated_text = [output.outputs[0].text for output in outputs]
print(generated_text)
```

vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
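
A served instance can then be queried with the OpenAI Python client, as in the hypothetical sketch below (the port, `api_key` placeholder, and sampling settings are assumptions; the server is assumed to have been started with `vllm serve neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic`):

```python
from openai import OpenAI

# Point the client at the local vLLM server (assumed default port 8000)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic",
    messages=[{"role": "user", "content": "Who are you? Please respond in pirate speak!"}],
    temperature=0.6,
    max_tokens=256,
)
print(response.choices[0].message.content)
```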

## Creation

This model was created with [llm-compressor](https://github.com/vllm-project/llm-compressor), as shown in the code snippet below.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

# Load model
model_stub = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model_name = model_stub.split("/")[-1]

model = AutoModelForCausalLM.from_pretrained(
    model_stub,
    torch_dtype="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_stub)

# Configure the quantization algorithm and scheme
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["lm_head"],
)

# Apply quantization
oneshot(
    model=model,
    recipe=recipe,
)

# Save to disk in compressed-tensors format
save_path = model_name + "-FP8-dynamic"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model and tokenizer saved to: {save_path}")
```
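
Because the FP8_DYNAMIC scheme computes activation scales at runtime, the `oneshot` call above needs no calibration dataset. As a quick sanity check, the saved checkpoint can be loaded straight back into vLLM (a hypothetical snippet; the path is the `save_path` written by the script above):

```python
from vllm import LLM

# vLLM reads the compressed-tensors quantization config saved with the checkpoint
llm = LLM(model="DeepSeek-R1-Distill-Llama-8B-FP8-dynamic")
output = llm.generate("Ahoy! Who be ye?")
print(output[0].outputs[0].text)
```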

## Evaluation

The model was evaluated on the OpenLLM Leaderboard V1 and V2 benchmarks with [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) on the vLLM backend, using the following commands.

OpenLLM Leaderboard V1:
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic",dtype=auto,max_model_len=4096,enable_chunked_prefill=True \
  --tasks openllm \
  --write_out \
  --batch_size auto \
  --output_path output_dir \
  --show_config
```

OpenLLM Leaderboard V2:
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic",dtype=auto,max_model_len=4096,enable_chunked_prefill=True \
  --apply_chat_template \
  --fewshot_as_multiturn \
  --tasks leaderboard \
  --write_out \
  --batch_size auto \
  --output_path output_dir \
  --show_config
```

### Accuracy