Update README.md
README.md CHANGED
@@ -65,7 +65,7 @@ sampling_params = SamplingParams(max_tokens=8192)
 # If you want to divide the GPU requirement over multiple devices, please add *e.g.* `tensor_parallel=2`
 llm = LLM(model=model_name, tokenizer_mode="mistral", config_format="mistral", load_format="mistral")
 
-prompt = "How many often does the letter
+prompt = "How often does the letter r occur in Mistral?"
 
 messages = [
     {
@@ -91,6 +91,7 @@ vllm serve mistralai/Mistral-Small-Instruct-2409 --tokenizer_mode mistral --conf
 ```
 
 **Note:** Running Mistral-Small on a single GPU requires at least 44 GB of GPU RAM.
+
 If you want to divide the GPU requirement over multiple devices, please add *e.g.* `--tensor_parallel=2`
 
 2. And ping the client:
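A side note on the unchanged context lines: current vLLM releases spell the multi-GPU option `tensor_parallel_size` (Python) and `--tensor-parallel-size` (CLI), so the README's `tensor_parallel=2` appears to be shorthand. A sketch of the offline two-GPU variant under that assumption:

```python
# Hedged sketch: shard Mistral-Small over two GPUs for offline inference.
# Assumes vLLM's tensor_parallel_size keyword is what the README's
# `tensor_parallel=2` shorthand refers to.
from vllm import LLM

llm = LLM(
    model="mistralai/Mistral-Small-Instruct-2409",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
    tensor_parallel_size=2,  # split the ~44 GB weight footprint across 2 GPUs
)
```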
@@ -104,10 +105,11 @@ curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
     "messages": [
       {
         "role": "user",
-        "content": "How many often does the letter
+        "content": "How often does the letter r occur in Mistral?"
       }
     ]
-
+}'
+
 ```
 
 ### Mistral-inference
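The hunk above both reworks the prompt text and adds the closing `}'` that the curl payload appears to have been missing. A Python equivalent of the corrected request may be handy; the host placeholder and model name are taken from the README's `vllm serve` line:

```python
# Python equivalent of the corrected curl call; <your-node-url> is a
# placeholder for the serving host, as in the README.
import requests

url = "http://<your-node-url>:8000/v1/chat/completions"
payload = {
    "model": "mistralai/Mistral-Small-Instruct-2409",
    "messages": [
        {"role": "user",
         "content": "How often does the letter r occur in Mistral?"}
    ],
}

response = requests.post(url, json=payload)
print(response.json()["choices"][0]["message"]["content"])
```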
@@ -157,7 +159,7 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
 tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
 model = Transformer.from_folder(mistral_models_path)
 
-completion_request = ChatCompletionRequest(messages=[UserMessage(content="
+completion_request = ChatCompletionRequest(messages=[UserMessage(content="How often does the letter r occur in Mistral?")])
 
 tokens = tokenizer.encode_chat_completion(completion_request).tokens
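For the mistral-inference path, the hunk lands mid-snippet. A sketch of the full flow under two assumptions: the standard `generate` helper from `mistral_inference`, and a placeholder value for `mistral_models_path`, which the README defines earlier:

```python
# Sketch of the full mistral-inference flow around the hunk above.
from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

mistral_models_path = "path/to/mistral_models"  # placeholder; set earlier in the README

tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
model = Transformer.from_folder(mistral_models_path)

completion_request = ChatCompletionRequest(
    messages=[UserMessage(content="How often does the letter r occur in Mistral?")]
)
tokens = tokenizer.encode_chat_completion(completion_request).tokens

# Greedy decode, then map the generated tokens back to text.
out_tokens, _ = generate(
    [tokens], model, max_tokens=256, temperature=0.0,
    eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id,
)
print(tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0]))
```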