Update README.md
README.md CHANGED
@@ -65,7 +65,7 @@ sampling_params = SamplingParams(max_tokens=8192)
 # If you want to divide the GPU requirement over multiple devices, please add *e.g.* `tensor_parallel=2`
 llm = LLM(model=model_name, tokenizer_mode="mistral", config_format="mistral", load_format="mistral")
 
-prompt = "How many often does the letter
+prompt = "How often does the letter r occur in Mistral?"
 
 messages = [
     {
@@ -91,6 +91,7 @@ vllm serve mistralai/Mistral-Small-Instruct-2409 --tokenizer_mode mistral --conf
 ```
 
 **Note:** Running Mistral-Small on a single GPU requires at least 44 GB of GPU RAM.
+
 If you want to divide the GPU requirement over multiple devices, please add *e.g.* `--tensor_parallel=2`
 
 2. And ping the client:
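A side note on the unchanged context lines: current vLLM releases spell the multi-GPU option `tensor_parallel_size` (Python) and `--tensor-parallel-size` (CLI), so the README's `tensor_parallel=2` appears to be shorthand. A sketch of the offline two-GPU variant under that assumption:

```python
# Hedged sketch: shard Mistral-Small over two GPUs for offline inference.
# Assumes vLLM's tensor_parallel_size keyword is what the README's
# `tensor_parallel=2` shorthand refers to.
from vllm import LLM

llm = LLM(
    model="mistralai/Mistral-Small-Instruct-2409",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
    tensor_parallel_size=2,  # split the ~44 GB weight footprint across 2 GPUs
)
```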
@@ -104,10 +105,11 @@ curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
     "messages": [
       {
         "role": "user",
-        "content": "How many often does the letter
+        "content": "How often does the letter r occur in Mistral?"
       }
     ]
-
+}'
+
 ```
 
 ### Mistral-inference
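The hunk above both reworks the prompt text and adds the closing `}'` that the curl payload appears to have been missing. A Python equivalent of the corrected request may be handy; the host placeholder and model name are taken from the README's `vllm serve` line:

```python
# Python equivalent of the corrected curl call; <your-node-url> is a
# placeholder for the serving host, as in the README.
import requests

url = "http://<your-node-url>:8000/v1/chat/completions"
payload = {
    "model": "mistralai/Mistral-Small-Instruct-2409",
    "messages": [
        {"role": "user",
         "content": "How often does the letter r occur in Mistral?"}
    ],
}

response = requests.post(url, json=payload)
print(response.json()["choices"][0]["message"]["content"])
```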
@@ -157,7 +159,7 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
 tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
 model = Transformer.from_folder(mistral_models_path)
 
-completion_request = ChatCompletionRequest(messages=[UserMessage(content="
+completion_request = ChatCompletionRequest(messages=[UserMessage(content="How often does the letter r occur in Mistral?")])
 
 tokens = tokenizer.encode_chat_completion(completion_request).tokens
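For the mistral-inference path, the hunk lands mid-snippet. A sketch of the full flow under two assumptions: the standard `generate` helper from `mistral_inference`, and a placeholder value for `mistral_models_path`, which the README defines earlier:

```python
# Sketch of the full mistral-inference flow around the hunk above.
from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

mistral_models_path = "path/to/mistral_models"  # placeholder; set earlier in the README

tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
model = Transformer.from_folder(mistral_models_path)

completion_request = ChatCompletionRequest(
    messages=[UserMessage(content="How often does the letter r occur in Mistral?")]
)
tokens = tokenizer.encode_chat_completion(completion_request).tokens

# Greedy decode, then map the generated tokens back to text.
out_tokens, _ = generate(
    [tokens], model, max_tokens=256, temperature=0.0,
    eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id,
)
print(tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0]))
```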