LeoNguyen committed
Commit e3a80c0 · 1 Parent(s): 10ec9ff

Update documentation and refine requirements: enhance the README with detailed installation instructions, Docker deployment steps, and key dependencies; update the requirements files to clarify optional packages and adjust CUDA-related dependencies; add cache directories to .gitignore; and tighten resource management in the application.

.gitignore CHANGED
@@ -16,4 +16,5 @@ bitsandbytes/*
  llama-cpp-python/*
  local_packages_for_win/*
  llama.cpp/*
- local_packages_for_server/*
+ local_packages_for_server/*
+ .cache/*
readme.github.md CHANGED
@@ -77,33 +77,76 @@ src/

  ### Prerequisites

- - Python 3.x
- - FastAPI
- - Uvicorn
+ - Python 3.11
+ - CUDA 12.9.0 (for GPU acceleration)
+ - FastAPI 0.114.0
+ - Uvicorn 0.34.2

  ### Installation

+ 1. Clone the repository
+ 2. Create a virtual environment:
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+ ```
+
+ 3. Install dependencies:
+
  ```bash
  pip install -r requirements.txt
  ```

  ### Running the Application

+ #### Local Development
+
  ```bash
  uvicorn main:app --reload --port 8080
  ```

- The application will be available at `http://localhost:8080`
+ #### Docker Deployment
+
+ ```bash
+ # Build the Docker image
+ docker build -t ai-assistance-server .
+
+ # Run the container
+ docker run -p 7860:7860 --gpus all ai-assistance-server
+ ```
+
+ The application will be available at:
+
+ - Local: `http://localhost:7860`
+ - Server: `http://0.0.0.0:7860` or https://leonguyen101120-ai-assistance.hf.space

  ## Development

- - Modular architecture: routes, services, models, utils
- - Environment variables required for some services (e.g., Brave, Jina API keys)
-
- ### Create and run docker file
-
- docker build -t ai-assistance-server .
- docker run -p 80:80 ai-assistance-server
+ ### Key Dependencies
+
+ - **AI/ML**:
+   - diffusers 0.33.1
+   - transformers 4.52.4
+   - torch 2.7.0
+   - accelerate 1.6.0
+
+ - **File Processing** (Optional):
+   - beautifulsoup4 4.13.4
+   - langchain_chroma 0.2.2
+   - langchain_huggingface 0.1.2
+   - langchain_community 0.3.19
+   - chromadb 0.6.3
+   - pymupdf 1.25.1
+
+ ### Environment Variables
+
+ The following environment variables are required for specific features:
+
+ - Brave Search API key (for web search)
+ - Jina API key (for web content reading)
+ - HuggingFace API key (for model access)

  ## License

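As a quick aside on the Environment Variables list in the updated README: the keys can be surfaced to the app through a small helper. This is only a minimal sketch; the exact variable names (`BRAVE_API_KEY`, `JINA_API_KEY`, `HF_TOKEN`) are assumptions, not names taken from the repository.

```python
# Minimal sketch: reading the API keys the README lists as required.
# The environment variable names below are assumptions; the repository may use different ones.
import os


def load_api_keys() -> dict[str, str | None]:
    return {
        "brave": os.getenv("BRAVE_API_KEY"),   # web search (assumed name)
        "jina": os.getenv("JINA_API_KEY"),     # web content reading (assumed name)
        "huggingface": os.getenv("HF_TOKEN"),  # model access (assumed name)
    }


if __name__ == "__main__":
    for name, value in load_api_keys().items():
        print(f"{name}: {'set' if value else 'missing'}")
```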
requirements.txt CHANGED
@@ -1,26 +1,29 @@
  fastapi[standard] == 0.114.0
  uvicorn == 0.34.2
  requests == 2.32.3
+ huggingface-hub == 0.32.0

- # # If use diffusers
+ # If use diffusers
  diffusers == 0.33.1
  accelerate == 1.6.0
- transformers == 4.52.4
+ # transformers == 4.52.4
  torch==2.7.0
- --extra-index-url https://download.pytorch.org/whl/cu128

- # Offline install for windows
- bitsandbytes -f ./local_packages_for_win/bitsandbytes-0.46.0-cp311-cp311-win_amd64.whl
+ # # If use bitsandbytes with cuda
+ # https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl

- # If use llama-cpp-python
- # llama-cpp-python -f ./local_packages_for_win/llama_cpp_python-0.3.9-cp311-cp311-win_amd64.whl
+ # # If use llama-cpp-python with cuda
+ # https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+
+ # # If use llama-cpp-python CPU
+ llama-cpp-python == 0.3.9

  # If process file feature enable
- # beautifulsoup4 == 4.13.4
- # requests == 2.32.3
- # langchain_chroma == 0.2.2
- # langchain_huggingface == 0.1.2
- # langchain_community == 0.3.19
- # chromadb == 0.6.3
- # pymupdf == 1.25.1
+ beautifulsoup4 == 4.13.4
+ requests == 2.32.3
+ langchain_chroma == 0.2.2
+ langchain_huggingface == 0.1.2
+ langchain_community == 0.3.19
+ chromadb == 0.6.3
+ pymupdf == 1.25.1

requirements_for_server.txt CHANGED
@@ -6,25 +6,25 @@ huggingface-hub == 0.32.0
  # If use diffusers
  diffusers == 0.33.1
  accelerate == 1.6.0
- transformers == 4.52.4
+ # transformers == 4.52.4
  torch==2.7.0
  --extra-index-url https://download.pytorch.org/whl/cu128

  # # If use bitsandbytes with cuda
- https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
+ # https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl

  # # If use llama-cpp-python with cuda
- # https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl

  # # If use llama-cpp-python CPU
  # llama-cpp-python == 0.3.9

  # If process file feature enable
- # beautifulsoup4 == 4.13.4
- # requests == 2.32.3
- # langchain_chroma == 0.2.2
- # langchain_huggingface == 0.1.2
- # langchain_community == 0.3.19
- # chromadb == 0.6.3
- # pymupdf == 1.25.1
+ beautifulsoup4 == 4.13.4
+ requests == 2.32.3
+ langchain_chroma == 0.2.2
+ langchain_huggingface == 0.1.2
+ langchain_community == 0.3.19
+ chromadb == 0.6.3
+ pymupdf == 1.25.1

src/constants/system_prompts.py CHANGED
@@ -42,6 +42,25 @@ When tool is required, or something prompt seem like request tool, respond in **

  > **Important:** No explanation, greetings, or comments should be included before or after this format. Return only the JSON block wrapped in `<tool_call> </tool_call>`.

+ ### Handling Image Generation Tool Calls
+ When the user requests image generation:
+ - Always return a new URL for each image generation request
+ - If the tool fails to generate a new image, return the URL from the last successful image generation
+ - Never return empty or null URLs for image generation requests
+ - If no previous image URL exists and the tool fails, respond with a clear error message
+ - Sometimes the input (or the last message) arrives in this format:
+ ```
+ {
+     "role": "tool",
+     "tool_call_id": "tool_call_id_here",
+     "content": "url_of_image_here",
+     "tool_call_name": "generate_image_url"
+ }
+ ```
+
+ You must return the image URL ([url_of_image_here]) to the user; make your response friendly and natural.
+
+
  ### Example
  #### Example 1:
  **User:**
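The new prompt section describes a tool-result message that carries the generated image URL. A hedged sketch of how such a message might be appended to the chat history before the follow-up generation (the helper below is hypothetical; only the key names come from the prompt text above, and the URL is illustrative):

```python
# Hypothetical sketch: appending an image-generation tool result to the chat
# history in the shape the system prompt describes. Key names come from the
# prompt text; build_tool_result_message itself is not part of the repo.
def build_tool_result_message(tool_call_id: str, image_url: str) -> dict:
    return {
        "role": "tool",
        "tool_call_id": tool_call_id,
        "content": image_url,  # URL produced by the image tool
        "tool_call_name": "generate_image_url",
    }


messages = [
    {"role": "user", "content": "Draw me a cat, please."},
]
# Illustrative values only; the real id and URL come from the tool call.
messages.append(build_tool_result_message("call_123", "http://0.0.0.0:7860/outputs/cat.png"))
# The model is then asked to respond again with this enriched history,
# returning the URL to the user in a friendly reply.
```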
src/main.py CHANGED
@@ -9,26 +9,32 @@ from fastapi.staticfiles import StaticFiles
  from constants.config import OUTPUT_DIR
  from models.responses.base_response import BaseResponse
  from routes import chat_routes, process_file_routes, vector_store_routes
- from utils.clients import image_pipeline_client, llama_cpp_client, transformer_client, vector_store_client
+ from utils.clients import (
+     image_pipeline_client,
+     llama_cpp_client,
+     transformer_client,
+     vector_store_client,
+ )
  from utils.exception import CustomException


  @asynccontextmanager
  async def lifespan(app: FastAPI):
      try:
-         transformer_client.load_model()
-         # vector_store_client.load_vector_store_client()
-         # image_pipeline_client.load_pipeline()
-         # llama_cpp_client.load()
+         # transformer_client.load_model()
+         vector_store_client.load_vector_store_client()
+         image_pipeline_client.load_pipeline()
+         llama_cpp_client.load()
          # pass

      except Exception as e:
          print(f"Error during startup: {str(e)}")
-         # raise e
+         raise e

      yield
-     # transformer_client.clear_resources()
-     # image_pipeline_client.clear_resources()
+     transformer_client.clear_resources()
+     image_pipeline_client.clear_resources()
+     llama_cpp_client.clear_resources()


  app = FastAPI(lifespan=lifespan)
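The lifespan hook now loads the clients on startup and clears them on shutdown. For context, a common shape for such a `clear_resources` routine when the client wraps a GPU-backed model is sketched below; this is an assumption-level illustration, not the repository's implementation.

```python
# Sketch of a clear_resources-style cleanup; assumes a module-level _model
# holding a GPU-backed object. This mirrors the intent of the lifespan
# shutdown calls but is not the repository's actual code.
import gc

import torch

_model = None  # placeholder for a loaded pipeline / LLM


def clear_resources() -> None:
    global _model
    _model = None  # drop the Python reference
    gc.collect()   # let the garbage collector reclaim host memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # release cached CUDA memory back to the driver
```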
src/services/chat_service.py CHANGED
@@ -82,15 +82,16 @@ def chat_generate_stream(

      with measure_time("Tool call handling"):
          tool_call_result = tools_helper.process_tool_calls(tool_calls)
-         tool_call_message = {
-             "role": "tool",
-             "content": tool_call_result.get("content", ""),
-         }
-         messages.append(tool_call_message)
+         # tool_call_message = {
+         #     "role": "tool",
+         #     "content": tool_call_result.get("content", ""),
+         # }
+         messages.append(tool_call_result)

      with measure_time("Generate new stream"):
          new_stream = client.generate_stream(messages, has_tool_call=False)
          for chunk in new_stream:
+             print(chunk.get("choices", [])[0].get("delta", {}).get("content"))
              yield chunk


@@ -112,11 +113,13 @@ def chat_generate(request: ChatRequest):

      with measure_time("Tool call handling"):
          tool_call_result = tools_helper.process_tool_calls(tool_calls=tool_calls)
-         tool_call_message = {
-             "role": "tool",
-             "content": tool_call_result.get("content", ""),
-         }
-         messages.append(tool_call_message)
+         # tool_call_message = {
+         #     "role": "tool",
+         #     "content": tool_call_result.get("content", ""),
+         # }
+         messages.append(tool_call_result)
+
+         print(messages)

      with measure_time("Generate new chat completion"):
          new_output = client.generate(messages=messages, has_tool_call=False)
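Both handlers now append `tool_call_result` to `messages` directly instead of wrapping it in a new dict, which only works if `process_tool_calls` already returns a complete chat message. The sketch below shows one plausible return shape consistent with that call site; the helper is hypothetical and not the repository's `tools_helper`.

```python
# Hypothetical sketch of a process_tool_calls return value that can be
# appended to `messages` as-is, consistent with messages.append(tool_call_result).
def process_tool_calls(tool_calls: list[dict]) -> dict:
    first = tool_calls[0] if tool_calls else {}
    # The requested tool would run here; a static string stands in for its result.
    result_content = "url_of_image_here"
    return {
        "role": "tool",
        "tool_call_id": first.get("id", ""),
        "content": result_content,
        "tool_call_name": first.get("function", {}).get("name", ""),
    }
```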
src/services/image_service.py CHANGED
@@ -4,10 +4,10 @@ from constants.config import OUTPUT_DIR
  from utils.clients import image_pipeline_client

  negative_promt = "blurry, distorted, pixelated, incomplete, poorly drawn, misaligned, weird proportions, bad perspective, unnatural colors, noisy, out of focus, glitchy, unsharp, overexposed, underexposed, poorly lit, bad composition, excessive noise, oversaturated, too dark, too bright, inconsistent lighting, discolored, overly stylized, unrealistic, awkward pose, unbalanced, mismatched, distorted features, flat, unnatural texture, chaotic, unreadable, incoherent, asymmetrical, low quality, lowres, wrong anatomy, bad anatomy, deformed, disfigured, ugly"
- width = 512
- height = 512
+ width = 64
+ height = 64
  guidance_scale = 7.5
- num_inference_steps = 30
+ num_inference_steps = 1

  base_url = "http://0.0.0.0:7860"

@@ -36,6 +36,6 @@ def generate_image_url(prompt: str) -> str:
          image_path = os.path.join(OUTPUT_DIR, file_name)
          image.save(image_path)

-         return f"{base_url}/{OUTPUT_DIR}/{file_name}"
+         return f"{base_url}{OUTPUT_DIR}/{file_name}"
      except Exception as e:
          raise RuntimeError(f"Failed to generate image: {e}")
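The returned URL now concatenates `base_url` and `OUTPUT_DIR` with no separating slash, which presumably relies on `OUTPUT_DIR` carrying a leading slash. If that assumption is ever broken, a join that normalizes slashes avoids malformed URLs; a small sketch (the `OUTPUT_DIR` value shown is only an example):

```python
# Sketch: building the public URL for a saved image without depending on
# whether OUTPUT_DIR carries a leading slash. posixpath keeps forward slashes
# regardless of OS. base_url matches image_service.py; OUTPUT_DIR is an example.
import posixpath

base_url = "http://0.0.0.0:7860"
OUTPUT_DIR = "outputs"  # example value; the real constant lives in constants.config


def public_image_url(file_name: str) -> str:
    path = posixpath.join("/", OUTPUT_DIR, file_name)  # -> "/outputs/<file>"
    return f"{base_url}{path}"


print(public_image_url("abc123.png"))  # http://0.0.0.0:7860/outputs/abc123.png
```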
src/utils/clients/llama_cpp_client.py CHANGED
@@ -3,7 +3,6 @@ from typing import Generator, List
  import uuid
  from constants.config import GGUF_FILE_NAME, GGUF_REPO_ID
  from utils.stream_helper import process_stream_content
- from utils.timing import measure_time
  from utils.tools import tools_define
  from utils.tools.tools_helper import extract_tool_calls_and_reupdate_output

@@ -15,6 +14,11 @@ def is_loaded() -> bool:
      return _llm is not None


+ def clear_resources():
+     global _llm
+     _llm = None
+
+
  def load():
      try:
          import llama_cpp
@@ -25,9 +29,21 @@ def load():

      global _llm

-     _llm = llama_cpp.Llama.from_pretrained(
-         repo_id=GGUF_REPO_ID,
-         filename=GGUF_FILE_NAME,
+     # _llm = llama_cpp.Llama.from_pretrained(
+     #     repo_id=GGUF_REPO_ID,
+     #     filename=GGUF_FILE_NAME,
+     #     n_threads=os.cpu_count(),
+     #     n_gpu_layers=-1,
+     #     n_ctx=4096,
+     #     verbose=True,
+     #     use_mlock=True,
+     #     use_mmap=True,
+     #     # messages_to_prompt=messages_to_prompt,
+     #     # completion_to_prompt=completion_to_prompt,
+     # )
+
+     _llm = llama_cpp.Llama(
+         model_path=f"./.cache/{GGUF_FILE_NAME}",
          n_threads=os.cpu_count(),
          n_gpu_layers=-1,
          n_ctx=4096,
@@ -87,8 +103,8 @@
      output = _llm.create_chat_completion(
          messages,  # type: ignore
          stream=True,
-         tools=tools_define.tools,  # type: ignore
-         tool_choice="auto",
+         tools=tools,  # type: ignore
+         tool_choice=tool_choice,
      )  # type: ignore

      def content_generator():
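`load()` now points `llama_cpp.Llama` at `./.cache/{GGUF_FILE_NAME}` instead of calling `Llama.from_pretrained`, while `huggingface-hub` is added to requirements.txt and `.cache/*` to .gitignore. One way the GGUF file could be staged into that directory is via `huggingface_hub`; the snippet below is a sketch under the assumption that the same `GGUF_REPO_ID`/`GGUF_FILE_NAME` constants are used (placeholder values shown):

```python
# Sketch: pre-downloading the GGUF model into ./.cache so that
# llama_cpp.Llama(model_path=f"./.cache/{GGUF_FILE_NAME}") can find it.
# This staging step is an assumption about the workflow, not code from the repo.
from huggingface_hub import hf_hub_download

GGUF_REPO_ID = "some-org/some-gguf-repo"  # placeholder; real value in constants.config
GGUF_FILE_NAME = "model.Q4_K_M.gguf"      # placeholder; real value in constants.config

# Downloads the file and places it under ./.cache (which this commit adds to .gitignore).
local_path = hf_hub_download(
    repo_id=GGUF_REPO_ID,
    filename=GGUF_FILE_NAME,
    local_dir="./.cache",
)
print(f"GGUF staged at: {local_path}")
```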