Qwen
/

Qwen3-4B-MLX-6bit

@@ -46,14 +46,26 @@ KeyError: 'qwen3'
 The following code snippet illustrates how to use the model to generate content based on given inputs.
 ```python
 from mlx_lm import load, generate
 model, tokenizer = load("Qwen/Qwen3-4B-MLX-6bit")
 prompt = "hello, Introduce yourself, and what can you do ?"
 if tokenizer.chat_template is not None:
     messages = [{"role": "user", "content": prompt}]
     prompt = tokenizer.apply_chat_template(
-        messages, add_generation_prompt=True
     )
-response = generate(model, tokenizer, prompt=prompt, verbose=True, max_tokens=1024)
 ```
 ## Switching Between Thinking and Non-Thinking Mode
@@ -107,6 +119,8 @@ Here is an example of a multi-turn conversation:
 ```python
 from mlx_lm import load, generate
 class QwenChatbot:
     def __init__(self, model_name="Qwen/Qwen3-4B-MLX-6bit"):
         self.model, self.tokenizer = load(model_name)
@@ -121,13 +135,20 @@ class QwenChatbot:
             add_generation_prompt=True
         )
-        response = generate(self.model, self.tokenizer, prompt=text, verbose=True, max_tokens=32768)
         # Update history
         self.history.append({"role": "user", "content": user_input})
         self.history.append({"role": "assistant", "content": response})
         return response
 # Example Usage
 if __name__ == "__main__":
     chatbot = QwenChatbot()
@@ -143,7 +164,7 @@ if __name__ == "__main__":
     user_input_2 = "Then, how many r's in blueberries? /no_think"
     print(f"User: {user_input_2}")
     response_2 = chatbot.generate_response(user_input_2)
-    print(f"Bot: {response_2}")
     print("----------------------")
     # Third input with /think
@@ -162,35 +183,37 @@ if __name__ == "__main__":
 Qwen3 excels in tool-calling capabilities. We recommend using [Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) to make the best use of the agentic abilities of Qwen3. Qwen-Agent encapsulates tool-calling templates and tool-calling parsers internally, greatly reducing coding complexity.
 To define the available tools, you can use the MCP configuration file, use the integrated tools of Qwen-Agent, or integrate other tools yourself.
 ```python
 from qwen_agent.agents import Assistant
 # Define LLM
 llm_cfg = {
-    'model': 'Qwen3-4B-MLX-6bit',
     # Use the endpoint provided by Alibaba Model Studio:
-    # 'model_type': 'qwen_dashscope',
-    # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
     # Use a custom endpoint compatible with OpenAI API:
-    'model_server': 'http://localhost:8000/v1',  # api_base
-    'api_key': 'EMPTY',
     # Other parameters:
-    # 'generate_cfg': {
-    #         # Add: When the response content is `<think>this is the thought</think>this is the answer;
-    #         # Do not add: When the response has been separated by reasoning_content and content.
-    #         'thought_in_content': True,
-    #     },
 }
 # Define Tools
 tools = [
-    {'mcpServers': {  # You can specify the MCP configuration file
-            'time': {
-                'command': 'uvx',
-                'args': ['mcp-server-time', '--local-timezone=Asia/Shanghai']
             },
             "fetch": {
                 "command": "uvx",
@@ -198,16 +221,23 @@ tools = [
             }
         }
     },
-  'code_interpreter',  # Built-in tools
 ]
 # Define Agent
 bot = Assistant(llm=llm_cfg, function_list=tools)
 # Streaming generation
-messages = [{'role': 'user', 'content': 'https://qwenlm.github.io/blog/ Introduce the latest developments of Qwen'}]
 for responses in bot.run(messages=messages):
     pass
 print(responses)
 ```

 The following code snippet illustrates how to use the model to generate content based on given inputs.
 ```python
 from mlx_lm import load, generate
 model, tokenizer = load("Qwen/Qwen3-4B-MLX-6bit")
 prompt = "hello, Introduce yourself, and what can you do ?"
 if tokenizer.chat_template is not None:
     messages = [{"role": "user", "content": prompt}]
     prompt = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True
     )
+response = generate(
+    model,
+    tokenizer,
+    prompt=prompt,
+    verbose=True,
+    max_tokens=1024
+)
+print(response)
 ```
 ## Switching Between Thinking and Non-Thinking Mode
 ```python
 from mlx_lm import load, generate
 class QwenChatbot:
     def __init__(self, model_name="Qwen/Qwen3-4B-MLX-6bit"):
         self.model, self.tokenizer = load(model_name)
             add_generation_prompt=True
         )
+        response = generate(
+            self.model,
+            self.tokenizer,
+            prompt=text,
+            verbose=True,
+            max_tokens=32768
+        )
         # Update history
         self.history.append({"role": "user", "content": user_input})
         self.history.append({"role": "assistant", "content": response})
         return response
 # Example Usage
 if __name__ == "__main__":
     chatbot = QwenChatbot()
     user_input_2 = "Then, how many r's in blueberries? /no_think"
     print(f"User: {user_input_2}")
     response_2 = chatbot.generate_response(user_input_2)
+    print(f"Bot: {response_2}")
     print("----------------------")
     # Third input with /think
 Qwen3 excels in tool-calling capabilities. We recommend using [Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) to make the best use of the agentic abilities of Qwen3. Qwen-Agent encapsulates tool-calling templates and tool-calling parsers internally, greatly reducing coding complexity.
 To define the available tools, you can use the MCP configuration file, use the integrated tools of Qwen-Agent, or integrate other tools yourself.
 ```python
 from qwen_agent.agents import Assistant
 # Define LLM
 llm_cfg = {
+    "model": "Qwen3-4B-MLX-6bit",
     # Use the endpoint provided by Alibaba Model Studio:
+    # "model_type": "qwen_dashscope",
+    # "api_key": os.getenv("DASHSCOPE_API_KEY"),
     # Use a custom endpoint compatible with OpenAI API:
+    "model_server": "http://localhost:8000/v1",  # api_base
+    "api_key": "EMPTY",
     # Other parameters:
+    # "generate_cfg": {
+    #     # Add: When the response content is `<think>this is the thought</think>this is the answer`;
+    #     # Do not add: When the response has been separated by reasoning_content and content.
+    #     "thought_in_content": True,
+    # },
 }
 # Define Tools
 tools = [
+    {
+        "mcpServers": {  # You can specify the MCP configuration file
+            "time": {
+                "command": "uvx",
+                "args": ["mcp-server-time", "--local-timezone=Asia/Shanghai"]
             },
             "fetch": {
                 "command": "uvx",
             }
         }
     },
+    "code_interpreter",  # Built-in tools
 ]
 # Define Agent
 bot = Assistant(llm=llm_cfg, function_list=tools)
 # Streaming generation
+messages = [
+    {
+        "role": "user",
+        "content": "https://qwenlm.github.io/blog/ Introduce the latest developments of Qwen"
+    }
+]
 for responses in bot.run(messages=messages):
     pass
 print(responses)
 ```