nm-research committed · verified
Commit 3e9d983 · Parent(s): 9043b00

Update README.md

Files changed (1): README.md (+42 −43)
README.md CHANGED
@@ -11,7 +11,7 @@ library_name: transformers
 # DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic
 
 ## Model Overview
-- **Model Architecture:** DeepSeek-R1-Distill-Llama-8B
+- **Model Architecture:** LlamaForCausalLM
 - **Input:** Text
 - **Output:** Text
 - **Model Optimizations:**
@@ -25,12 +25,15 @@ Quantized version of [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deeps
 
 ### Model Optimizations
 
-This model was obtained by quantizing the weights and activations to FP8 data type, ready for inference with vLLM.
-This optimization reduces the number of bits per parameter from 16 to 8, reducing the disk size and GPU memory requirements by approximately 50%. Only the weights and activations of the linear operators within transformers blocks are quantized.
-
-## Deployment
-
-### Use with vLLM
-
+This model was obtained by quantizing the weights and activations of [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) to FP8 data type.
+This optimization reduces the number of bits per parameter from 16 to 8, reducing the disk size and GPU memory requirements by approximately 50%.
+
+Only the weights and activations of the linear operators within transformer blocks are quantized.
+Weights are quantized using a symmetric per-channel scheme, whereas activations are quantized using a symmetric per-token scheme.
+[LLM Compressor](https://github.com/vllm-project/llm-compressor) is used for quantization.
+
+
+## Use with vLLM
+
 This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
 
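A note on the scheme the new text describes: "dynamic" means the activation scales are not calibrated ahead of time but computed per token at inference. Below is a minimal numeric sketch of that arithmetic, illustrative only and not the llm-compressor implementation; it assumes PyTorch's `float8_e4m3fn` type, whose maximum representable value is 448.

```python
import torch

FP8_E4M3_MAX = 448.0  # largest finite value representable in e4m3

def quantize_per_token(x: torch.Tensor):
    # One scale per token (row), computed on the fly at inference time;
    # this is what makes the scheme "dynamic". Clamp guards against
    # all-zero rows.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / FP8_E4M3_MAX
    q = (x / scale).to(torch.float8_e4m3fn)
    return q, scale

x = torch.randn(4, 16)                 # [tokens, hidden] activations
q, scale = quantize_per_token(x)
dequant = q.to(torch.float32) * scale  # approximate reconstruction
print((x - dequant).abs().max())       # small quantization error
```

Weights get the analogous treatment offline, with one static scale per output channel.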
@@ -38,11 +41,12 @@ This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 
-max_model_len, tp_size = 4096, 1
+number_gpus = 1
 model_name = "neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic"
+
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-llm = LLM(model=model_name, tensor_parallel_size=tp_size, max_model_len=max_model_len, trust_remote_code=True)
 sampling_params = SamplingParams(temperature=0.6, max_tokens=256, stop_token_ids=[tokenizer.eos_token_id])
+llm = LLM(model=model_name, tensor_parallel_size=number_gpus, trust_remote_code=True)
 
 messages_list = [
     [{"role": "user", "content": "Who are you? Please respond in pirate speak!"}],
@@ -64,44 +68,40 @@ This model was created with [llm-compressor](https://github.com/vllm-project/llm
 
 
 ```python
-import argparse
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.transformers import oneshot
 import os
 
-def main():
-    parser = argparse.ArgumentParser(description='Quantize a transformer model to FP8')
-    parser.add_argument('--model_id', type=str, required=True,
-                        help='The model ID from HuggingFace (e.g., "meta-llama/Meta-Llama-3-8B-Instruct")')
-    parser.add_argument('--save_path', type=str, default='.',
-                        help='Custom path to save the quantized model. If not provided, will use model_name-FP8-dynamic')
-    args = parser.parse_args()
-
-    # Load model
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model_id, device_map="auto", torch_dtype="auto", trust_remote_code=True,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
-
-    # Configure the quantization algorithm and scheme
-    recipe = QuantizationModifier(
-        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
-    )
-
-    # Apply quantization
-    oneshot(model=model, recipe=recipe)
-
-    save_path = os.path.join(args.save_path, args.model_id.split("/")[1] + "-FP8-dynamic")
-    os.makedirs(save_path, exist_ok=True)
-
-    # Save to disk in compressed-tensors format
-    model.save_pretrained(save_path)
-    tokenizer.save_pretrained(save_path)
-    print(f"Model and tokenizer saved to: {save_path}")
-
-if __name__ == "__main__":
-    main()
+# Load model
+model_stub = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+model_name = model_stub.split("/")[-1]
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_stub,
+    torch_dtype="auto",
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_stub)
+
+# Configure the quantization algorithm and scheme
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="FP8_DYNAMIC",
+    ignore=["lm_head"],
+)
+
+# Apply quantization
+oneshot(
+    model=model,
+    recipe=recipe,
+)
+
+# Save to disk in compressed-tensors format
+save_path = model_name + "-FP8-dynamic"
+model.save_pretrained(save_path)
+tokenizer.save_pretrained(save_path)
+print(f"Model and tokenizer saved to: {save_path}")
 ```
 
 ## Evaluation
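After running the creation script in this hunk, one way to confirm the checkpoint was serialized in compressed-tensors form is to inspect its config. This is a sketch added for illustration, not part of the card; the exact contents of `quantization_config` depend on the llm-compressor and transformers versions.

```python
from transformers import AutoConfig

# Load the config of the checkpoint saved above and print the
# quantization metadata that llm-compressor serialized into config.json.
config = AutoConfig.from_pretrained("DeepSeek-R1-Distill-Llama-8B-FP8-dynamic")
print(config.quantization_config)  # should describe the FP8_DYNAMIC scheme
```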
@@ -112,7 +112,7 @@ OpenLLM Leaderboard V1:
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
+  --model_args pretrained="neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic",dtype=auto,max_model_len=4096,enable_chunked_prefill=True \
   --tasks openllm \
   --write_out \
   --batch_size auto \
@@ -124,7 +124,7 @@ OpenLLM Leaderboard V2:
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
+  --model_args pretrained="neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic",dtype=auto,max_model_len=4096,enable_chunked_prefill=True \
   --apply_chat_template \
   --fewshot_as_multiturn \
   --tasks leaderboard \
@@ -132,7 +132,6 @@ lm_eval \
   --batch_size auto \
   --output_path output_dir \
   --show_config
-
 ```
 
 ### Accuracy
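The `lm_eval` commands above can also be driven from Python through the harness's `simple_evaluate` entry point. A minimal sketch mirroring the Leaderboard V1 invocation follows; flag-for-flag parity with the CLI is not guaranteed.

```python
import lm_eval

# Programmatic equivalent of the V1 command above: vLLM backend,
# same model_args string, the "openllm" task group, auto batching.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args="pretrained=neuralmagic/DeepSeek-R1-Distill-Llama-8B-FP8-Dynamic,dtype=auto,max_model_len=4096,enable_chunked_prefill=True",
    tasks=["openllm"],
    batch_size="auto",
)
print(results["results"])
```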