LeoNguyen committed · Commit e3a80c0 · Parent(s): 10ec9ff

Update documentation and refine requirements: Enhance the README with detailed installation instructions, Docker deployment steps, and key dependencies. Update requirements files to clarify optional packages and adjust CUDA-related dependencies. Modify .gitignore to include cache directories and ensure proper resource management in the application.

Files changed:
- .gitignore +2 -1
- readme.github.md +52 -9
- requirements.txt +17 -14
- requirements_for_server.txt +10 -10
- src/constants/system_prompts.py +19 -0
- src/main.py +14 -8
- src/services/chat_service.py +13 -10
- src/services/image_service.py +4 -4
- src/utils/clients/llama_cpp_client.py +22 -6
.gitignore
CHANGED
@@ -16,4 +16,5 @@ bitsandbytes/*
 llama-cpp-python/*
 local_packages_for_win/*
 llama.cpp/*
-local_packages_for_server/*
+local_packages_for_server/*
+.cache/*
readme.github.md
CHANGED
@@ -77,33 +77,76 @@ src/
 
 ### Prerequisites
 
-- Python 3.
+- Python 3.11
+- CUDA 12.9.0 (for GPU acceleration)
+- FastAPI 0.114.0
+- Uvicorn 0.34.2
 
 ### Installation
 
+1. Clone the repository
+2. Create a virtual environment:
+
+```bash
+python -m venv .venv
+source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+```
+
+3. Install dependencies:
+
 ```bash
 pip install -r requirements.txt
 ```
 
 ### Running the Application
 
+#### Local Development
+
 ```bash
 uvicorn main:app --reload --port 8080
 ```
 
+#### Docker Deployment
+
+```bash
+# Build the Docker image
+docker build -t ai-assistance-server .
+
+# Run the container
+docker run -p 7860:7860 --gpus all ai-assistance-server
+```
+
+The application will be available at:
+
+- Local: `http://localhost:7860`
+- Server: `http://0.0.0.0:7860` or https://leonguyen101120-ai-assistance.hf.space
 
 ## Development
 
-- Environment variables required for some services (e.g., Brave, Jina API keys)
+### Key Dependencies
+
+- **AI/ML**:
+  - diffusers 0.33.1
+  - transformers 4.52.4
+  - torch 2.7.0
+  - accelerate 1.6.0
+
+- **File Processing** (Optional):
+  - beautifulsoup4 4.13.4
+  - langchain_chroma 0.2.2
+  - langchain_huggingface 0.1.2
+  - langchain_community 0.3.19
+  - chromadb 0.6.3
+  - pymupdf 1.25.1
+
+### Environment Variables
+
+The following environment variables are required for specific features:
+
+- Brave Search API key (for web search)
+- Jina API key (for web content reading)
+- HuggingFace API key (for model access)
 
 ## License
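Once the container (or local uvicorn process) is up, a quick smoke test confirms the server is reachable. This is a minimal sketch, not part of the commit: it assumes FastAPI's default `/docs` route is still enabled in `src/main.py`, and the port depends on how you launched the app (8080 for the local uvicorn command, 7860 for Docker or the hosted Space).

```python
# Minimal reachability check for the server started above.
# Assumes the default FastAPI /docs route has not been disabled.
import requests

BASE_URL = "http://localhost:7860"  # use :8080 for the local uvicorn command

resp = requests.get(f"{BASE_URL}/docs", timeout=10)
print(resp.status_code)  # 200 means uvicorn is serving the app
```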
requirements.txt
CHANGED
@@ -1,26 +1,29 @@
 fastapi[standard] == 0.114.0
 uvicorn == 0.34.2
 requests == 2.32.3
+huggingface-hub == 0.32.0
 
-#
+# If use diffusers
 diffusers == 0.33.1
 accelerate == 1.6.0
-transformers == 4.52.4
+# transformers == 4.52.4
 torch==2.7.0
---extra-index-url https://download.pytorch.org/whl/cu128
 
-#
-bitsandbytes
+# # If use bitsandbytes with cuda
+# https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
 
-# If use llama-cpp-python
-# llama-cpp-python
+# # If use llama-cpp-python with cuda
+# https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+
+# # If use llama-cpp-python CPU
+llama-cpp-python == 0.3.9
 
 # If process file feature enable
+beautifulsoup4 == 4.13.4
+requests == 2.32.3
+langchain_chroma == 0.2.2
+langchain_huggingface == 0.1.2
+langchain_community == 0.3.19
+chromadb == 0.6.3
+pymupdf == 1.25.1
 
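The CUDA-specific wheels for bitsandbytes and llama-cpp-python are now commented out here, so which backend you actually end up with depends on the install. A small check using only the public torch API (nothing repo-specific) confirms whether GPU acceleration is available after installing these requirements:

```python
# Post-install sanity check: shows which torch build got installed and
# whether CUDA is usable, since the CUDA-specific wheels are optional here.
import torch

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
else:
    print("CPU only - the llama-cpp-python CPU wheel path applies")
```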
requirements_for_server.txt
CHANGED
@@ -6,25 +6,25 @@ huggingface-hub == 0.32.0
 # If use diffusers
 diffusers == 0.33.1
 accelerate == 1.6.0
-transformers == 4.52.4
+# transformers == 4.52.4
 torch==2.7.0
 --extra-index-url https://download.pytorch.org/whl/cu128
 
 # # If use bitsandbytes with cuda
-https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
+# https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
 
 # # If use llama-cpp-python with cuda
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
 
 # # If use llama-cpp-python CPU
 # llama-cpp-python == 0.3.9
 
 # If process file feature enable
+beautifulsoup4 == 4.13.4
+requests == 2.32.3
+langchain_chroma == 0.2.2
+langchain_huggingface == 0.1.2
+langchain_community == 0.3.19
+chromadb == 0.6.3
+pymupdf == 1.25.1
 
src/constants/system_prompts.py
CHANGED
@@ -42,6 +42,25 @@ When tool is required, or something prompt seem like request tool, respond in **
 
 > **Important:** No explanation, greetings, or comments should be included before or after this format. Return only the JSON block wrapped in `<tool_call> </tool_call>`.
 
+### Handling Image Generation Tool Calls
+When the user requests image generation:
+- Always return a new URL for each image generation request
+- If the tool fails to generate a new image, return the URL from the last successful image generation
+- Never return empty or null URLs for image generation requests
+- If no previous image URL exists and the tool fails, respond with a clear error message
+- Sometimes the input, or the last message, arrives in this format:
+```
+{
+  "role": "tool",
+  "tool_call_id": "tool_call_id_here",
+  "content": "url_of_image_here",
+  "tool_call_name": "generate_image_url"
+}
+```
+
+You must return the image URL [url_of_image_here] to the user; keep your response friendly and natural.
+
+
 ### Example
 #### Example 1:
 **User:**
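The new prompt section documents the tool-result message shape the model may receive after `generate_image_url` runs. The sketch below only mirrors that JSON format from the Python side; `build_tool_message` and the placeholder values are illustrative and not part of the repository.

```python
# Hypothetical helper mirroring the tool-result format quoted in the new
# "Handling Image Generation Tool Calls" prompt section.
from typing import Dict


def build_tool_message(tool_call_id: str, image_url: str) -> Dict[str, str]:
    # Field names follow the prompt; the helper itself is illustrative only.
    return {
        "role": "tool",
        "tool_call_id": tool_call_id,
        "content": image_url,  # the URL the model is expected to relay back
        "tool_call_name": "generate_image_url",
    }


messages = [{"role": "user", "content": "Generate an image of a sunset"}]
messages.append(build_tool_message("tool_call_id_here", "url_of_image_here"))
print(messages[-1])
```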
src/main.py
CHANGED
@@ -9,26 +9,32 @@ from fastapi.staticfiles import StaticFiles
 from constants.config import OUTPUT_DIR
 from models.responses.base_response import BaseResponse
 from routes import chat_routes, process_file_routes, vector_store_routes
-from utils.clients import
+from utils.clients import (
+    image_pipeline_client,
+    llama_cpp_client,
+    transformer_client,
+    vector_store_client,
+)
 from utils.exception import CustomException
 
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
-        transformer_client.load_model()
+        # transformer_client.load_model()
+        vector_store_client.load_vector_store_client()
+        image_pipeline_client.load_pipeline()
+        llama_cpp_client.load()
         # pass
 
     except Exception as e:
         print(f"Error during startup: {str(e)}")
+        raise e
 
     yield
+    transformer_client.clear_resources()
+    image_pipeline_client.clear_resources()
+    llama_cpp_client.clear_resources()
 
 
 app = FastAPI(lifespan=lifespan)
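The lifespan changes follow the standard FastAPI pattern: load the heavy clients before `yield`, release them afterwards, and re-raise startup errors so the process fails fast. Below is a stripped-down sketch of that pattern with a stub client standing in for the project's real modules.

```python
# Stand-alone sketch of the startup/shutdown pattern used in src/main.py.
# DummyClient stands in for the real image pipeline / llama.cpp / vector
# store clients; it is not part of the repository.
from contextlib import asynccontextmanager

from fastapi import FastAPI


class DummyClient:
    def load(self) -> None:
        print("loading model/pipeline ...")

    def clear_resources(self) -> None:
        print("releasing resources ...")


client = DummyClient()


@asynccontextmanager
async def lifespan(app: FastAPI):
    try:
        client.load()            # mirrors load_pipeline() / load() at startup
    except Exception as e:
        print(f"Error during startup: {e}")
        raise                    # same intent as the new `raise e`
    yield
    client.clear_resources()     # mirrors the clear_resources() calls on shutdown


app = FastAPI(lifespan=lifespan)
```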
src/services/chat_service.py
CHANGED
@@ -82,15 +82,16 @@ def chat_generate_stream(
 
     with measure_time("Tool call handling"):
         tool_call_result = tools_helper.process_tool_calls(tool_calls)
-        tool_call_message = {
-            "role": "tool",
-            "content": tool_call_result.get("content", ""),
-        }
-        messages.append(tool_call_message)
+        # tool_call_message = {
+        #     "role": "tool",
+        #     "content": tool_call_result.get("content", ""),
+        # }
+        messages.append(tool_call_result)
 
     with measure_time("Generate new stream"):
         new_stream = client.generate_stream(messages, has_tool_call=False)
         for chunk in new_stream:
+            print(chunk.get("choices", [])[0].get("delta", {}).get("content"))
             yield chunk
 
 
@@ -112,11 +113,13 @@ def chat_generate(request: ChatRequest):
 
     with measure_time("Tool call handling"):
         tool_call_result = tools_helper.process_tool_calls(tool_calls=tool_calls)
-        tool_call_message = {
-            "role": "tool",
-            "content": tool_call_result.get("content", ""),
-        }
-        messages.append(tool_call_message)
+        # tool_call_message = {
+        #     "role": "tool",
+        #     "content": tool_call_result.get("content", ""),
+        # }
+        messages.append(tool_call_result)
+
+        print(messages)
 
     with measure_time("Generate new chat completion"):
         new_output = client.generate(messages=messages, has_tool_call=False)
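The new `print` inside the stream loop pulls the incremental text out of each chunk. For reference, here is that extraction in isolation, assuming chunks use the OpenAI-style layout that llama-cpp-python emits; the sample chunk is made up, not captured from the running service.

```python
# Extracting the streamed text delta from an OpenAI-style chunk dict, as the
# new print() call does. The sample chunk below is illustrative only.
from typing import Optional


def chunk_text(chunk: dict) -> Optional[str]:
    choices = chunk.get("choices", [])
    if not choices:
        return None
    return choices[0].get("delta", {}).get("content")


sample_chunk = {"choices": [{"delta": {"content": "Hello"}}]}
print(chunk_text(sample_chunk))  # -> Hello
```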
src/services/image_service.py
CHANGED
@@ -4,10 +4,10 @@ from constants.config import OUTPUT_DIR
 from utils.clients import image_pipeline_client
 
 negative_promt = "blurry, distorted, pixelated, incomplete, poorly drawn, misaligned, weird proportions, bad perspective, unnatural colors, noisy, out of focus, glitchy, unsharp, overexposed, underexposed, poorly lit, bad composition, excessive noise, oversaturated, too dark, too bright, inconsistent lighting, discolored, overly stylized, unrealistic, awkward pose, unbalanced, mismatched, distorted features, flat, unnatural texture, chaotic, unreadable, incoherent, asymmetrical, low quality, lowres, wrong anatomy, bad anatomy, deformed, disfigured, ugly"
-width =
-height =
+width = 64
+height = 64
 guidance_scale = 7.5
-num_inference_steps =
+num_inference_steps = 1
 
 base_url = "http://0.0.0.0:7860"
 
@@ -36,6 +36,6 @@ def generate_image_url(prompt: str) -> str:
         image_path = os.path.join(OUTPUT_DIR, file_name)
         image.save(image_path)
 
-        return f"{base_url}
+        return f"{base_url}{OUTPUT_DIR}/{file_name}"
     except Exception as e:
         raise RuntimeError(f"Failed to generate image: {e}")
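The fixed return statement builds the public URL by concatenating `base_url`, `OUTPUT_DIR`, and the file name directly, so it only yields a well-formed URL if `OUTPUT_DIR` begins with a slash (or `base_url` ends with one). A quick illustration with a placeholder `OUTPUT_DIR`, since the real value lives in `constants/config.py`:

```python
# Illustration of the URL produced by the new return statement in
# generate_image_url. OUTPUT_DIR below is an assumed placeholder, not the
# repository's actual value from constants.config.
base_url = "http://0.0.0.0:7860"
OUTPUT_DIR = "/outputs"   # assumed shape; a missing leading slash would break the URL
file_name = "example.png"

url = f"{base_url}{OUTPUT_DIR}/{file_name}"
print(url)  # http://0.0.0.0:7860/outputs/example.png
```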
src/utils/clients/llama_cpp_client.py
CHANGED
@@ -3,7 +3,6 @@ from typing import Generator, List
 import uuid
 from constants.config import GGUF_FILE_NAME, GGUF_REPO_ID
 from utils.stream_helper import process_stream_content
-from utils.timing import measure_time
 from utils.tools import tools_define
 from utils.tools.tools_helper import extract_tool_calls_and_reupdate_output
 
@@ -15,6 +14,11 @@ def is_loaded() -> bool:
     return _llm is not None
 
 
+def clear_resources():
+    global _llm
+    _llm = None
+
+
 def load():
     try:
         import llama_cpp
@@ -25,9 +29,21 @@ def load():
 
     global _llm
 
-    _llm = llama_cpp.Llama.from_pretrained(
-        repo_id=GGUF_REPO_ID,
-        filename=GGUF_FILE_NAME,
+    # _llm = llama_cpp.Llama.from_pretrained(
+    #     repo_id=GGUF_REPO_ID,
+    #     filename=GGUF_FILE_NAME,
+    #     n_threads=os.cpu_count(),
+    #     n_gpu_layers=-1,
+    #     n_ctx=4096,
+    #     verbose=True,
+    #     use_mlock=True,
+    #     use_mmap=True,
+    #     # messages_to_prompt=messages_to_prompt,
+    #     # completion_to_prompt=completion_to_prompt,
+    # )
+
+    _llm = llama_cpp.Llama(
+        model_path=f"./.cache/{GGUF_FILE_NAME}",
         n_threads=os.cpu_count(),
         n_gpu_layers=-1,
         n_ctx=4096,
@@ -87,8 +103,8 @@ def generate_stream(
     output = _llm.create_chat_completion(
         messages,  # type: ignore
         stream=True,
-        tools=
-        tool_choice=
+        tools=tools,  # type: ignore
+        tool_choice=tool_choice,
     )  # type: ignore
 
     def content_generator():
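With `Llama.from_pretrained` commented out, the client now expects the GGUF file to already sit at `./.cache/{GGUF_FILE_NAME}`, which lines up with the new `.cache/*` entry in `.gitignore` and the `huggingface-hub` pin in the requirements. One hedged way to stage that file uses the public `hf_hub_download` API; whether the project actually downloads it this way is not shown in this commit, and the repo/file names below are placeholders.

```python
# Hedged sketch: pre-download the GGUF weights into ./.cache so that
# llama_cpp.Llama(model_path=f"./.cache/{GGUF_FILE_NAME}") can find them.
# GGUF_REPO_ID / GGUF_FILE_NAME come from constants.config in the repo;
# the literal values here are placeholders.
from huggingface_hub import hf_hub_download

GGUF_REPO_ID = "some-org/some-gguf-repo"   # placeholder
GGUF_FILE_NAME = "model.Q4_K_M.gguf"       # placeholder

local_path = hf_hub_download(
    repo_id=GGUF_REPO_ID,
    filename=GGUF_FILE_NAME,
    local_dir="./.cache",  # matches the model_path used in load()
)
print("downloaded to:", local_path)
```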