LeoNguyen101120 committed
Commit 2692e0d · 1 Parent(s): c62df12

Refactor chat handling and model integration: update .env.example to include new API keys, modify main.py to implement a lifespan context manager for resource management, and replace the Message class with dictionary structures in chat_request.py and chat_service.py for improved flexibility. Remove unused message and response models to streamline the codebase.
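The startup/shutdown wiring added in src/main.py follows FastAPI's lifespan pattern. Below is a minimal, self-contained sketch of that pattern; only the lifespan name and the clear_resources calls mirror this commit, the import path and comments are illustrative:

from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Code before `yield` runs once at startup (the model-loading calls are
    # still commented out in this commit).
    yield
    # Code after `yield` runs once at shutdown, e.g. releasing models and
    # pipelines so memory is freed:
    #   transformer_client.clear_resources()
    #   image_pipeline.clear_resources()


app = FastAPI(lifespan=lifespan)

FastAPI drives this generator itself: everything before the yield happens before the first request is served, and everything after it runs when the application stops.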

.env.example CHANGED
@@ -1,3 +1,4 @@
 jina_api_key=
 brave_search_api_key=
-ai_url=
+ai_url=
+serp_api
src/main.py CHANGED
@@ -1,5 +1,7 @@
 import os
+from sre_parse import Tokenizer
 from fastapi import FastAPI, Request
+from fastapi.concurrency import asynccontextmanager
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
@@ -7,9 +9,26 @@ from fastapi.staticfiles import StaticFiles
 from constants.config import OUTPUT_DIR
 from models.responses.base_response import BaseResponse
 from routes import chat_routes, process_file_routes, vector_store_routes
+from utils import image_pipeline, transformer_client
 from utils.exception import CustomException
 
-app = FastAPI()
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # try:
+    #     transformer_client.load_model()
+    #     image_pipeline.load_pipeline()
+
+    # except Exception as e:
+    #     print(f"Error during startup: {str(e)}")
+
+    yield
+
+    transformer_client.clear_resources()
+    image_pipeline.clear_resources()
+
+
+app = FastAPI(lifespan=lifespan)
 
 origins = ["*"]
 app.add_middleware(
@@ -20,34 +39,43 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+
 @app.exception_handler(CustomException)
 async def custom_exception_handler(request: Request, exc: CustomException):
     return JSONResponse(
         status_code=exc.status_code,
-        content=BaseResponse(status_code=exc.status_code, message=exc.message).model_dump(),
+        content=BaseResponse(
+            status_code=exc.status_code, message=exc.message
+        ).model_dump(),
     )
 
+
 @app.exception_handler(Exception)
 async def global_exception_handler(request: Request, exc: Exception):
     # Default handler for errors not handled by CustomException
     return JSONResponse(
         status_code=500,
-        content=BaseResponse(status_code=500, message=str(exc)).model_dump()
+        content=BaseResponse(status_code=500, message=str(exc)).model_dump(),
    )
 
+
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
     return JSONResponse(
         status_code=422,
-        content=BaseResponse(status_code=422, message="Validation error").model_dump()
+        content=BaseResponse(status_code=422, message="Validation error").model_dump(),
     )
 
+
 app.include_router(chat_routes.router, prefix="/api/v1")
 app.include_router(process_file_routes.router, prefix="/api/v1")
 app.include_router(vector_store_routes.router, prefix="/api/v1")
+
+
 @app.get("/")
 def read_root():
     return {"message": "Welcome to my API"}
 
+
 os.makedirs(OUTPUT_DIR, exist_ok=True)
-app.mount(OUTPUT_DIR, StaticFiles(directory=OUTPUT_DIR), name="outputs")
+app.mount(OUTPUT_DIR, StaticFiles(directory=OUTPUT_DIR), name="outputs")
src/models/others/message.py DELETED
@@ -1,24 +0,0 @@
-from enum import Enum
-from typing import List, Optional
-
-from pydantic import BaseModel
-
-from models.responses.tool_call_response import ToolCall
-
-
-class Role(str, Enum):
-    assistant = "assistant"
-    user = "user"
-    system = "system"
-    tool = "tool"
-
-
-class Message(BaseModel):
-    role: Role
-    content: Optional[str] = None
-    tool_calls: Optional[List[ToolCall]] = None
-
-    def to_map(self):
-        data = self.model_dump(exclude_none=True)
-        data["role"] = self.role.value
-        return data
src/models/requests/chat_request.py CHANGED
@@ -2,11 +2,10 @@ from typing import List, Optional
 from pydantic import BaseModel
 
 from constants.config import LLM_MODEL_NAME
-from models.others.message import Role, Message
 
 
 class ChatRequest(BaseModel):
-    messages: List[Message]
+    messages: List[dict]
     has_file: bool = False
     chat_session_id: str | None = None
 
@@ -16,7 +15,7 @@ class ChatRequest(BaseModel):
             {
                 "has_file": False,
                 "chat_session_id": "123",
-                "messages": [{"role": Role.user, "content": "hello"}],
+                "messages": [{"role": 'user', "content": "hello"}],
             }
         ]
     }
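To illustrate the new request shape, a ChatRequest is now built from plain role/content dicts and read with dict access, as chat_service.py does below. A minimal sketch (the fields mirror this diff; the example values are illustrative):

from typing import List, Optional

from pydantic import BaseModel


class ChatRequest(BaseModel):
    # Messages are plain dicts in the OpenAI-style chat format.
    messages: List[dict]
    has_file: bool = False
    chat_session_id: Optional[str] = None


request = ChatRequest(
    chat_session_id="123",
    messages=[{"role": "user", "content": "hello"}],
)
print(request.messages[-1].get("content"))  # -> hello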
src/models/responses/chat_response.py DELETED
@@ -1,92 +0,0 @@
-from typing import Any, List, Optional
-from click import argument
-from pydantic import BaseModel
-from models.others.message import Message, Role
-from models.responses.tool_call_response import ToolCall
-
-
-class Choice(BaseModel):
-    message: Optional[Message] = None
-    delta: Optional[Message] = None
-    function_call: Optional[ToolCall] = None
-
-
-class ChatResponse(BaseModel):
-    id: Optional[str] = None
-    choices: Optional[List[Choice]] = None
-
-    @classmethod
-    def from_stream_chunk(cls, chunk: dict, last_role: Optional[Role] = None):
-        choices = []
-        updated_role = last_role  # Default to last role
-
-        for choice in chunk.get("choices", []):
-            delta_data = choice.get("delta", {})
-
-            # Skip chunks that contain neither content nor role
-            if not delta_data.get("content") and not delta_data.get("role"):
-                continue
-
-            # Determine role
-            if "role" in delta_data and delta_data["role"] is not None:
-                try:
-                    updated_role = Role(delta_data["role"])
-                except ValueError:
-                    # Skip or log invalid role values
-                    continue
-
-            if not updated_role:
-                # Still no role available, skip
-                continue
-
-            message = Message(
-                role=updated_role,
-                content=delta_data.get("content"),
-            )
-
-            choices.append(
-                Choice(
-                    message=message,
-                    delta=message,
-                )
-            )
-
-        return (
-            cls(
-                id=chunk.get("id"),
-                choices=choices,
-            ),
-            updated_role,
-        )
-
-    @classmethod
-    def from_llm_output(cls, output: dict) -> "ChatResponse":
-        """
-        Map the output dict from llm.create_chat_completion to a ChatResponse instance.
-        """
-        choices = []
-        for choice in output.get("choices", []):
-            message_data = choice.get("message", {})
-            tool_calls_data = message_data.get("tool_calls")
-            tool_calls = None
-            if tool_calls_data:
-                tool_calls = [ToolCall(**tc) for tc in tool_calls_data]
-            message = Message(
-                role=Role(message_data["role"]),
-                content=message_data.get("content"),
-                tool_calls=tool_calls,
-            )
-            # function_call is for OpenAI compatibility, may be None
-            function_call = None
-            if "function_call" in choice:
-                function_call = ToolCall(**choice["function_call"])
-            choices.append(
-                Choice(
-                    message=message,
-                    function_call=function_call,
-                )
-            )
-        return cls(
-            id=output.get("id"),
-            choices=choices,
-        )
src/models/responses/tool_call_response.py DELETED
@@ -1,13 +0,0 @@
-from typing import Optional
-from pydantic import BaseModel
-
-
-class FunctionOfToolCall(BaseModel):
-    name: Optional[str]
-    arguments: Optional[str]
-
-
-class ToolCall(BaseModel):
-    id: Optional[str]
-    type: Optional[str]
-    function: Optional[FunctionOfToolCall]
src/routes/chat_routes.py CHANGED
@@ -70,9 +70,7 @@ async def chat(request: ChatRequest):
     try:
         response = chat_service.chat_generate(request=request)
         return BaseResponse(
-            data=json.loads(
-                response.model_dump_json(),
-            ),
+            data=response,
         )
     except Exception as e:
         raise BaseExceptionResponse(message=str(e))
src/services/chat_service.py CHANGED
@@ -6,13 +6,12 @@ from services import vector_store_service
 from utils import open_ai_client
 from utils.timing import measure_time
 from utils.tools import tools_helper, tools_define
-from models.others.message import Message, Role
 from utils.transformer_client import generate, generate_stream
 
 
-def build_context_prompt(request: ChatRequest) -> list[Message]:
+def build_context_prompt(request: ChatRequest) -> list[dict]:
     """Build system prompt with context if file is provided."""
-    messages = [Message(role=Role.system, content=system_prompts.system_prompt)]
+    messages = [{"role": "system", "content": system_prompts.system_prompt}]
 
     if not request.has_file or not vector_store_service.check_if_collection_exists(
         request.chat_session_id
@@ -21,7 +20,7 @@ def build_context_prompt(request: ChatRequest) -> list[Message]:
 
     with measure_time("Get data from vector store"):
         vectorstore = vector_store_service.get_vector_store(request.chat_session_id)
-        query = request.messages[-1].content
+        query = request.messages[-1].get("content")
         results = vectorstore.similarity_search(query=query or "", k=10)
 
     if not results:
@@ -42,7 +41,7 @@ def build_context_prompt(request: ChatRequest) -> list[Message]:
         f"CONTEXT: {context}\nQUESTION: {query}"
     )
 
-    messages.append(Message(role=Role.system, content=embedded_prompt))
+    messages.append({"role": "system", "content": embedded_prompt})
     return messages
 
 
@@ -62,21 +61,19 @@ def chat_generate_stream(
     final_tool_calls = {}
 
     for chunk in stream:
-        if chunk.choices and len(chunk.choices) > 0:
-            delta = chunk.choices[0].delta
-            if getattr(delta, "tool_calls", None):
-                final_tool_calls = tools_helper.final_tool_calls_handler(
-                    final_tool_calls, delta.tool_calls, is_stream=True
-                )
-        yield chunk
+        choices = chunk.get("choices", [])
+        if choices and choices[0].get("delta", {}).get("tool_calls"):
+            delta = choices[0]["delta"]
+            final_tool_calls = tools_helper.final_tool_calls_handler(
+                final_tool_calls, delta["tool_calls"], is_stream=True
+            )
+        yield chunk
 
     if not final_tool_calls:
         return
 
     tool_call_result = tools_helper.process_tool_calls(final_tool_calls)
-    tool_call_message = Message(
-        role=Role.tool, content=tool_call_result.get("content", "")
-    )
+    tool_call_message = {"role": "tool", "content": tool_call_result.get("content", "")}
     messages.append(tool_call_message)
 
     # new_stream = open_ai_client.chat.completions.create(
@@ -94,39 +91,26 @@ def chat_generate(request: ChatRequest):
     messages = build_context_prompt(request)
     messages.extend(request.messages)
 
-    # output = open_ai_client.open_ai_client.chat.completions.create(
-    #     messages=messages, model="my-model", tools=tools_define.tools
-    # )
-    output = generate(messages=messages)
-
-    final_tool_calls = {}
+    output = open_ai_client.open_ai_client.chat.completions.create(
+        messages=messages, model="my-model", tools=tools_define.tools
+    ).model_dump()
+    # output = generate(messages=messages)
+    choices = output.get("choices", [])
 
-    message = None
-    if output.choices and len(output.choices) > 0:
-        message = output.choices[0].message
-    if (
-        message is not None
-        and getattr(message, "tool_calls", None)
-        and message.tool_calls
-    ):
-        final_tool_calls = tools_helper.final_tool_calls_handler(
-            final_tool_calls=final_tool_calls, tool_calls=message.tool_calls
-        )
+    tool_calls = choices[0].get("message").get("tool_calls")
 
-    if not final_tool_calls:
+    if not tool_calls:
         return output
 
     tool_call_result = tools_helper.process_tool_calls(
-        final_tool_calls=final_tool_calls
-    )
-    tool_call_message = Message(
-        role=Role.tool, content=tool_call_result.get("content", "")
+        tool_calls=tool_calls
     )
+    tool_call_message = {"role": "tool", "content": tool_call_result.get("content", "")}
     messages.append(tool_call_message)
 
-    new_output = generate(messages=messages, has_tool_call=False)
-    # new_output = open_ai_client.chat.completions.create(
-    #     messages=messages, model="my-model", tools=tools_define.tools
-    # )
+    # new_output = generate(messages=messages, has_tool_call=False)
+    new_output = open_ai_client.open_ai_client.chat.completions.create(
+        messages=messages, model="my-model"
+    ).model_dump()
 
     return new_output
src/utils/image_pipeline.py CHANGED
@@ -1,34 +1,27 @@
-# import torch
-# from diffusers import StableDiffusionPipeline
-# from constants.config import IMAGE_MODEL_ID_OR_LINK, TORCH_DEVICE
+import torch
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
+    StableDiffusionPipeline,
+)
+from constants.config import IMAGE_MODEL_ID_OR_LINK, TORCH_DEVICE
+from utils.timing import measure_time
 
-# torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for performance on CUDA
+torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for performance on CUDA
 
-# _pipeline = None
+pipeline = None
 
 
-# def get_pipeline() -> StableDiffusionPipeline:
-#     global _pipeline
-#     if _pipeline is None:
-#         try:
-#             _pipeline = StableDiffusionPipeline.from_pretrained(
-#                 IMAGE_MODEL_ID_OR_LINK,
-#                 torch_dtype=torch.bfloat16,
-#                 variant="fp16",
-#                 # safety_checker=True,
-#                 use_safetensors=True,
-#             )
-#             # _pipeline = StableDiffusionPipeline.from_single_file(
-#             #     IMAGE_MODEL_ID_OR_LINK,
-#             #     torch_dtype=torch.bfloat16,
-#             #     variant="fp16",
-#             #     # safety_checker=True,
-#             #     use_safetensors=True,
-#             # )
-#             _pipeline.to(TORCH_DEVICE)
-#         except Exception as e:
-#             raise RuntimeError(f"Failed to load the model: {e}")
-#     return _pipeline
+def load_pipeline():
+    global pipeline
+    with measure_time("Load image pipeline"):
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            IMAGE_MODEL_ID_OR_LINK,
+            torch_dtype=torch.bfloat16,
+            variant="fp16",
+            # safety_checker=True,
+            use_safetensors=True,
+        )
 
 
-# pipeline = get_pipeline()
+def clear_resources():
+    global pipeline
+    pipeline = None
src/utils/timing.py CHANGED
@@ -1,10 +1,12 @@
 import time
 
+
 class measure_time:
     def __init__(self, label="Operation"):
         self.label = label
 
     def __enter__(self):
+        print(f"\nStart: {self.label}")
        self.start = time.time()
        return self
 
src/utils/tools/tools_helper.py CHANGED
@@ -1,6 +1,5 @@
 import json
 from typing import List
-from models.responses.tool_call_response import ToolCall
 from utils.tools.tools_define import ToolFunction
 from services import image_service, web_data_service
 
@@ -15,7 +14,7 @@ def extract_tool_args(tool_call):
     Returns:
         dict: The extracted arguments as a dictionary
     """
-    return json.loads(tool_call.function.arguments)
+    return json.loads(tool_call.get("function", {}).get("arguments", "{}"))
 
 
 def handle_web_data_tool_call(tool_call):
@@ -66,7 +65,7 @@ def handle_search_web_tool_call(tool_call):
     return search_results
 
 
-def process_tool_calls(final_tool_calls):
+def process_tool_calls(tool_calls):
     """
     Process all tool calls and execute them.
 
@@ -87,8 +86,8 @@
         ToolFunction.SEARCH_WEB.value: handle_search_web_tool_call,
     }
 
-    for tool_call in final_tool_calls.values():
-        handler = tool_handlers.get(tool_call.function.name)
+    for tool_call in tool_calls:
+        handler = tool_handlers.get(tool_call.get("function").get("name"))
         if handler:
             result = handler(tool_call)
             if isinstance(result, list):
@@ -98,14 +97,14 @@
 
     return {
         "role": "tool",
-        "tool_call_id": tool_call.id,
-        "tool_call_name": tool_call.function.name,
+        "tool_call_id": tool_call.get("id"),
+        "tool_call_name": tool_call.get("function", {}).get("name"),
         "content": content,
     }
 
 
 def final_tool_calls_handler(
-    final_tool_calls: dict, tool_calls: List[ToolCall], is_stream: bool = False
+    final_tool_calls: dict, tool_calls: List[dict], is_stream: bool = False
 ):
     """
     Handle and combine multiple tool calls.
@@ -120,13 +119,11 @@
     for index, tool_call in enumerate(tool_calls):
         if index not in final_tool_calls:
             final_tool_calls[index] = tool_call
-        elif tool_call.function is not None:
+        elif tool_call.get("function") is not None:
             if is_stream:
-                final_tool_calls[
-                    index
-                ].function.arguments += tool_call.function.arguments
+                if "function" in final_tool_calls[index] and "arguments" in final_tool_calls[index]["function"]:
+                    final_tool_calls[index]["function"]["arguments"] += tool_call.get("function", {}).get("arguments", "")
             else:
-                final_tool_calls[index].function.arguments = (
-                    tool_call.function.arguments
-                )
+                if "function" in final_tool_calls[index]:
+                    final_tool_calls[index]["function"]["arguments"] = tool_call.get("function", {}).get("arguments", "")
     return final_tool_calls
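To make the dict-based accumulation above concrete, here is a small standalone sketch of how streamed tool-call fragments are merged by index. It is simplified from final_tool_calls_handler: the chunk shapes follow the OpenAI-style dicts assumed throughout this diff, the helper name is illustrative, and the extra guard clauses are omitted.

def merge_tool_call_fragments(final_tool_calls: dict, tool_calls: list) -> dict:
    # Streamed chunks deliver partial "arguments" strings; concatenate them per index.
    for index, tool_call in enumerate(tool_calls):
        if index not in final_tool_calls:
            final_tool_calls[index] = tool_call
        elif tool_call.get("function") is not None:
            fragment = tool_call.get("function", {}).get("arguments", "")
            final_tool_calls[index]["function"]["arguments"] += fragment
    return final_tool_calls


# Two fragments of one streamed tool call merge into a complete arguments string.
acc = {}
acc = merge_tool_call_fragments(
    acc,
    [{"id": "call_1", "function": {"name": "search_web", "arguments": '{"query": "fast'}}],
)
acc = merge_tool_call_fragments(acc, [{"function": {"arguments": 'api"}'}}])
print(acc[0]["function"]["arguments"])  # -> {"query": "fastapi"}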
src/utils/transformer_client.py CHANGED
@@ -1,71 +1,122 @@
-import os
+from email import message
+import json
+import re
 from threading import Thread
 from typing import Generator, List
-import torch
+import uuid
+from numpy import append
 from transformers.models.auto.modeling_auto import AutoModelForCausalLM
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from constants.config import (
-    GGUF_FILE_NAME,
     LLM_MODEL_NAME,
-    GGUF_REPO_ID,
     TORCH_DEVICE,
     USE_QUANT,
     MODEL_OPTIMIZATION,
-    IS_APPLE_SILICON,
 )
-from models.others.message import Message, Role
-from models.responses.chat_response import ChatResponse
 from transformers.generation.streamers import TextIteratorStreamer
 from utils.timing import measure_time
 from utils.tools import tools_define
 from transformers.utils.quantization_config import BitsAndBytesConfig
 
-# Configure model loading based on device
-if USE_QUANT:
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=MODEL_OPTIMIZATION["torch_dtype"],
-        bnb_4bit_use_double_quant=True,
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        LLM_MODEL_NAME,
-        torch_dtype=MODEL_OPTIMIZATION["torch_dtype"],
-        device_map="auto",
-        quantization_config=quantization_config,
-        low_cpu_mem_usage=MODEL_OPTIMIZATION["low_cpu_mem_usage"],
-        use_cache=MODEL_OPTIMIZATION["use_cache"],
-    )
-else:
-    model = AutoModelForCausalLM.from_pretrained(
-        LLM_MODEL_NAME,
-        torch_dtype=MODEL_OPTIMIZATION["torch_dtype"],
-        device_map="auto",
-        low_cpu_mem_usage=MODEL_OPTIMIZATION["low_cpu_mem_usage"],
-        use_cache=MODEL_OPTIMIZATION["use_cache"],
-    )
 
-# Configure tokenizer with appropriate settings
-tokenizer = AutoTokenizer.from_pretrained(
-    LLM_MODEL_NAME,
-    use_fast=True,  # Use fast tokenizer for better performance
-)
+def load_model():
+    global _model, _tokenizer
+
+    # Configure model loading based on device
+    try:
+        with measure_time("Load model"):
+            if USE_QUANT:
+                quantization_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=MODEL_OPTIMIZATION["torch_dtype"],
+                    bnb_4bit_use_double_quant=True,
+                )
+                _model = AutoModelForCausalLM.from_pretrained(
+                    LLM_MODEL_NAME,
+                    torch_dtype=MODEL_OPTIMIZATION["torch_dtype"],
+                    device_map="auto",
+                    quantization_config=quantization_config,
+                    low_cpu_mem_usage=MODEL_OPTIMIZATION["low_cpu_mem_usage"],
+                    use_cache=MODEL_OPTIMIZATION["use_cache"],
+                )
+            else:
+                _model = AutoModelForCausalLM.from_pretrained(
+                    LLM_MODEL_NAME,
+                    torch_dtype=MODEL_OPTIMIZATION["torch_dtype"],
+                    device_map="auto",
+                    low_cpu_mem_usage=MODEL_OPTIMIZATION["low_cpu_mem_usage"],
+                    use_cache=MODEL_OPTIMIZATION["use_cache"],
+                )
+
+            # Configure tokenizer with appropriate settings
+            _tokenizer = AutoTokenizer.from_pretrained(
+                LLM_MODEL_NAME,
+                use_fast=True,  # Use fast tokenizer for better performance
+            )
+
+            _model.eval()
+    except Exception as e:
+        print(f"Failed to load model or tokenizer: {str(e)}")
+        _model = None
+        _tokenizer = None
+        raise
+
+
+def clear_resources():
+    global _model, _tokenizer
+    _model = None
+    _tokenizer = None
 
 
-def build_prompt(messages: List[Message]) -> str:
-    return "\n".join([f"{m.role.value}: {m.content}" for m in messages])
+def build_prompt(messages: List[dict]) -> str:
+    return "\n".join([f"{m.get('role')}: {m.get('content')}" for m in messages])
+
+
+def extract_tool_calls_and_reupdate_output(text: str):
+    """
+    Extracts all valid JSON objects found within <tool_call>{...}</tool_call> patterns.
+    """
+    tool_calls = []
+
+    # Match any <tool_call> JSON-like structure (greedy to match full JSON block)
+    pattern = r"<tool_call>\s*(\{.*?\})\s*</?tool_call>?"
+
+    matches = list(re.finditer(pattern, text, re.DOTALL))
+
+    for match in matches:
+        try:
+            tool_call = {}
+            tool_call["id"] = str(uuid.uuid4())
+            tool_call["type"] = "function"
+            tool_call["function"] = {
+                "name": match.group(1),
+                "arguments": json.loads(match.group(1)),
+            }
+            tool_calls.append(tool_call)
+        except json.JSONDecodeError:
+            continue
+
+    text = re.sub(pattern, "", text, flags=re.DOTALL).strip()
+    return text.strip(), tool_calls if tool_calls else None
 
 
-def generate(messages: List[Message], has_tool_call: bool = True) -> ChatResponse:
+def generate(messages: List[dict], has_tool_call: bool = True) -> dict:
+
+    if _model is None or _tokenizer is None:
+        raise RuntimeError(
+            "Model or tokenizer not initialized. Ensure load_model was called successfully."
+        )
+
     # Convert messages to prompt
-    prompt = [message.to_map() for message in messages]
+    prompt = build_prompt(messages)
 
     # Prepare tools if enabled
     tools = tools_define.tools if has_tool_call else None
     tool_choice = "auto" if has_tool_call else "none"
 
     # Apply chat template
-    formatted_prompt = tokenizer.apply_chat_template(
+    formatted_prompt = _tokenizer.apply_chat_template(
         prompt,
         tools=tools,
         tool_choice=tool_choice,
@@ -73,61 +124,71 @@ def generate(messages: List[Message], has_tool_call: bool = True) -> ChatResponse:
         add_generation_prompt=True,
     )
 
-    print("Starting create chat completion")
     try:
-        with measure_time("Starting create chat completion"):
+        with measure_time("Create chat completion"):
             # Tokenize input with optimized settings
-            inputs = tokenizer(
+            inputs = _tokenizer(
                 formatted_prompt,
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
-                max_length=2048,  # Adjust based on your needs
+                max_length=4096,  # Adjust based on your needs
            ).to(TORCH_DEVICE)
 
            # Generate response with optimized settings
-            output_ids = model.generate(
+            output_ids = _model.generate(
                **inputs,
                max_new_tokens=4096,
                do_sample=True,
                temperature=0.7,
-                pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
+                pad_token_id=_tokenizer.pad_token_id,
+                eos_token_id=_tokenizer.eos_token_id,
                use_cache=True,  # Enable KV cache for faster generation
                num_beams=1,  # Use greedy decoding for faster inference
            )
 
            # Decode response
-            output_text = tokenizer.decode(
+            output_text = _tokenizer.decode(
                output_ids[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
            )
 
-            # Create ChatResponse using from_llm_output
-            return ChatResponse.from_llm_output(
-                {
-                    "choices": [
-                        {
-                            "message": {
-                                "role": Role.assistant.value,
-                                "content": output_text,
-                            }
-                        }
-                    ]
-                }
+            cleaned_output, tool_calls = extract_tool_calls_and_reupdate_output(
+                output_text
            )
+
+            # Create ChatResponse using from_llm_output
+            return {
+                "id": f"chatcmpl-{uuid.uuid4().hex}",
+                "choices": [
+                    {
+                        "message": {
+                            "role": "assistant",
+                            "content": cleaned_output,
+                            "tool_calls": tool_calls,
+                        },
+                    }
+                ],
+            }
+
    except Exception as e:
        print(f"Error in create chat completion: {str(e)}")
        raise
 
 
-def generate_stream(messages: List[Message]) -> Generator[ChatResponse, None, None]:
+def generate_stream(messages: List[dict]) -> Generator[dict, None, None]:
+
+    if _model is None or _tokenizer is None:
+        raise RuntimeError(
+            "Model or tokenizer not initialized. Ensure load_model was called successfully."
+        )
+
    # Convert messages to prompt
-    prompt = [message.to_map() for message in messages]
+    prompt = build_prompt(messages)
    # Prepare tools
    tools = tools_define.tools
 
    # Apply chat template
-    formatted_prompt = tokenizer.apply_chat_template(
+    formatted_prompt = _tokenizer.apply_chat_template(
        prompt,
        tools=tools,
        tool_choice="auto",
@@ -137,7 +198,7 @@ def generate_stream(messages: List[Message]) -> Generator[ChatResponse, None, None]:
 
    try:
        # Tokenize input with optimized settings
-        inputs = tokenizer(
+        inputs = _tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
@@ -147,7 +208,7 @@ def generate_stream(messages: List[Message]) -> Generator[ChatResponse, None, None]:
 
        # Generate streaming output
        streamer = TextIteratorStreamer(
-            tokenizer,
+            _tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )
@@ -158,17 +219,17 @@ def generate_stream(messages: List[Message]) -> Generator[ChatResponse, None, None]:
            do_sample=True,
            max_new_tokens=4096,
            temperature=0.7,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=_tokenizer.pad_token_id,
+            eos_token_id=_tokenizer.eos_token_id,
            use_cache=True,  # Enable KV cache for faster generation
            num_beams=1,  # Use greedy decoding for faster inference
        )
 
        # Generate in background thread
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread = Thread(target=_model.generate, kwargs=generation_kwargs)
        thread.start()
 
-        last_role = Role.assistant
+        last_role = "assistant"
        for new_text in streamer:
            # Format the chunk to match the expected structure
            chunk = {