patrickvonplaten committed · Commit 21fef4a · verified · 1 Parent(s): b6fd6a2

Update README.md

Files changed (1): README.md (+6 −4)
README.md CHANGED
@@ -65,7 +65,7 @@ sampling_params = SamplingParams(max_tokens=8192)
 # If you want to divide the GPU requirement over multiple devices, please add *e.g.* `tensor_parallel=2`
 llm = LLM(model=model_name, tokenizer_mode="mistral", config_format="mistral", load_format="mistral")
 
-prompt = "How often does the letter 'r' occur in 'Mistral'?"
+prompt = "How often does the letter r occur in Mistral?"
 
 messages = [
     {
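The hunk above edits the README's offline vLLM example. For context, a minimal end-to-end sketch of that snippet (not the verbatim README code; `llm.chat` and the surrounding scaffolding are assumptions based on vLLM's offline chat API):

```python
# Hedged sketch of the full offline example this hunk edits.
from vllm import LLM
from vllm.sampling_params import SamplingParams

model_name = "mistralai/Mistral-Small-Instruct-2409"
sampling_params = SamplingParams(max_tokens=8192)

# Mistral-native tokenizer/config/weight formats, as in the hunk above.
llm = LLM(model=model_name, tokenizer_mode="mistral",
          config_format="mistral", load_format="mistral")

prompt = "How often does the letter r occur in Mistral?"
messages = [{"role": "user", "content": prompt}]

# llm.chat applies the chat template and runs generation.
res = llm.chat(messages=messages, sampling_params=sampling_params)
print(res[0].outputs[0].text)
```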
@@ -91,6 +91,7 @@ vllm serve mistralai/Mistral-Small-Instruct-2409 --tokenizer_mode mistral --config_format mistral --load_format mistral
 ```
 
 **Note:** Running Mistral-Small on a single GPU requires at least 44 GB of GPU RAM.
+
 If you want to divide the GPU requirement over multiple devices, please add *e.g.* `--tensor_parallel=2`
 
 2. And ping the client:
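A note on the flag mentioned in this hunk: in vLLM's offline Python API the corresponding argument is `tensor_parallel_size` (the README's `--tensor_parallel=2` reads like shorthand for it). A hedged sketch of the two-GPU variant:

```python
# Sketch: sharding Mistral-Small over two GPUs, so each device needs
# roughly half of the 44 GB quoted for a single GPU.
from vllm import LLM

llm = LLM(
    model="mistralai/Mistral-Small-Instruct-2409",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
    tensor_parallel_size=2,
)
```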
@@ -104,10 +105,11 @@ curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
     "messages": [
       {
         "role": "user",
-        "content": "How often does the letter 'r' occur in 'Mistral'?",
+        "content": "How often does the letter r occur in Mistral?"
       }
     ]
-}'
+}'
+
 ```
 
 ### Mistral-inference
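The curl call this hunk edits can also be issued from Python; a sketch assuming the `requests` package (not part of the README) and vLLM's default port 8000:

```python
# Sketch: same request as the curl example, via Python's requests library.
import requests

url = "http://<your-node-url>:8000/v1/chat/completions"  # fill in your host
payload = {
    "model": "mistralai/Mistral-Small-Instruct-2409",
    "messages": [
        {"role": "user",
         "content": "How often does the letter r occur in Mistral?"},
    ],
}
resp = requests.post(url, json=payload,
                     headers={"Content-Type": "application/json"})
print(resp.json()["choices"][0]["message"]["content"])
```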
@@ -157,7 +159,7 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
 tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
 model = Transformer.from_folder(mistral_models_path)
 
-completion_request = ChatCompletionRequest(messages=[UserMessage(content="Explain Machine Learning to me in a nutshell.")])
+completion_request = ChatCompletionRequest(messages=[UserMessage(content="How often does the letter r occur in Mistral?")])
 
 tokens = tokenizer.encode_chat_completion(completion_request).tokens
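For completeness, the mistral-inference snippet this hunk edits continues with a generate step; a hedged sketch of the whole flow (import paths and the `generate` signature follow mistral-inference's published examples and may differ across versions):

```python
# Hedged sketch: full mistral-inference flow around the edited line.
from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

mistral_models_path = "..."  # path where the model weights were downloaded

tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
model = Transformer.from_folder(mistral_models_path)

completion_request = ChatCompletionRequest(
    messages=[UserMessage(content="How often does the letter r occur in Mistral?")]
)
tokens = tokenizer.encode_chat_completion(completion_request).tokens

# Greedy decode; eos_id comes from the underlying tokenizer.
out_tokens, _ = generate(
    [tokens], model, max_tokens=64, temperature=0.0,
    eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id,
)
print(tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0]))
```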
 