LeoNguyen committed
Commit e3a80c0 · 1 Parent(s): 10ec9ff

Update documentation and refine requirements: enhance the README with detailed installation instructions, Docker deployment steps, and key dependencies; update the requirements files to clarify optional packages and adjust CUDA-related dependencies; add cache directories to .gitignore; and tighten resource management in the application.

.gitignore CHANGED
@@ -16,4 +16,5 @@ bitsandbytes/*
  llama-cpp-python/*
  local_packages_for_win/*
  llama.cpp/*
- local_packages_for_server/*
+ local_packages_for_server/*
+ .cache/*
readme.github.md CHANGED
@@ -77,33 +77,76 @@ src/

  ### Prerequisites

- - Python 3.x
- - FastAPI
- - Uvicorn
+ - Python 3.11
+ - CUDA 12.9.0 (for GPU acceleration)
+ - FastAPI 0.114.0
+ - Uvicorn 0.34.2

  ### Installation

+ 1. Clone the repository
+ 2. Create a virtual environment:
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+ ```
+
+ 3. Install dependencies:
+
  ```bash
  pip install -r requirements.txt
  ```

  ### Running the Application

+ #### Local Development
+
  ```bash
  uvicorn main:app --reload --port 8080
  ```

- The application will be available at `http://localhost:8080`
+ #### Docker Deployment
+
+ ```bash
+ # Build the Docker image
+ docker build -t ai-assistance-server .
+
+ # Run the container
+ docker run -p 7860:7860 --gpus all ai-assistance-server
+ ```
+
+ The application will be available at:
+
+ - Local: `http://localhost:7860`
+ - Server: `http://0.0.0.0:7860` or https://leonguyen101120-ai-assistance.hf.space

  ## Development

- - Modular architecture: routes, services, models, utils
- - Environment variables required for some services (e.g., Brave, Jina API keys)
-
- ### Create and run docker file
-
- docker build -t ai-assistance-server .
- docker run -p 80:80 ai-assistance-server
+ ### Key Dependencies
+
+ - **AI/ML**:
+   - diffusers 0.33.1
+   - transformers 4.52.4
+   - torch 2.7.0
+   - accelerate 1.6.0
+
+ - **File Processing** (Optional):
+   - beautifulsoup4 4.13.4
+   - langchain_chroma 0.2.2
+   - langchain_huggingface 0.1.2
+   - langchain_community 0.3.19
+   - chromadb 0.6.3
+   - pymupdf 1.25.1
+
+ ### Environment Variables
+
+ The following environment variables are required for specific features:
+
+ - Brave Search API key (for web search)
+ - Jina API key (for web content reading)
+ - HuggingFace API key (for model access)

  ## License

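As a quick aside on the Environment Variables list in the updated README: the keys can be surfaced to the app through a small helper. This is only a minimal sketch; the exact variable names (`BRAVE_API_KEY`, `JINA_API_KEY`, `HF_TOKEN`) are assumptions, not names taken from the repository.

```python
# Minimal sketch: reading the API keys the README lists as required.
# The environment variable names below are assumptions; the repository may use different ones.
import os


def load_api_keys() -> dict[str, str | None]:
    return {
        "brave": os.getenv("BRAVE_API_KEY"),   # web search (assumed name)
        "jina": os.getenv("JINA_API_KEY"),     # web content reading (assumed name)
        "huggingface": os.getenv("HF_TOKEN"),  # model access (assumed name)
    }


if __name__ == "__main__":
    for name, value in load_api_keys().items():
        print(f"{name}: {'set' if value else 'missing'}")
```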
requirements.txt CHANGED
@@ -1,26 +1,29 @@
  fastapi[standard] == 0.114.0
  uvicorn == 0.34.2
  requests == 2.32.3
+ huggingface-hub == 0.32.0

- # # If use diffusers
+ # If use diffusers
  diffusers == 0.33.1
  accelerate == 1.6.0
- transformers == 4.52.4
+ # transformers == 4.52.4
  torch==2.7.0
- --extra-index-url https://download.pytorch.org/whl/cu128

- # Offline install for windows
- bitsandbytes -f ./local_packages_for_win/bitsandbytes-0.46.0-cp311-cp311-win_amd64.whl
+ # # If use bitsandbytes with cuda
+ # https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl

- # If use llama-cpp-python
- # llama-cpp-python -f ./local_packages_for_win/llama_cpp_python-0.3.9-cp311-cp311-win_amd64.whl
+ # # If use llama-cpp-python with cuda
+ # https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+
+ # # If use llama-cpp-python CPU
+ llama-cpp-python == 0.3.9

  # If process file feature enable
- # beautifulsoup4 == 4.13.4
- # requests == 2.32.3
- # langchain_chroma == 0.2.2
- # langchain_huggingface == 0.1.2
- # langchain_community == 0.3.19
- # chromadb == 0.6.3
- # pymupdf == 1.25.1
+ beautifulsoup4 == 4.13.4
+ requests == 2.32.3
+ langchain_chroma == 0.2.2
+ langchain_huggingface == 0.1.2
+ langchain_community == 0.3.19
+ chromadb == 0.6.3
+ pymupdf == 1.25.1

requirements_for_server.txt CHANGED
@@ -6,25 +6,25 @@ huggingface-hub == 0.32.0
  # If use diffusers
  diffusers == 0.33.1
  accelerate == 1.6.0
- transformers == 4.52.4
+ # transformers == 4.52.4
  torch==2.7.0
  --extra-index-url https://download.pytorch.org/whl/cu128

  # # If use bitsandbytes with cuda
- https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
+ # https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl

  # # If use llama-cpp-python with cuda
- # https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl

  # # If use llama-cpp-python CPU
  # llama-cpp-python == 0.3.9

  # If process file feature enable
- # beautifulsoup4 == 4.13.4
- # requests == 2.32.3
- # langchain_chroma == 0.2.2
- # langchain_huggingface == 0.1.2
- # langchain_community == 0.3.19
- # chromadb == 0.6.3
- # pymupdf == 1.25.1
+ beautifulsoup4 == 4.13.4
+ requests == 2.32.3
+ langchain_chroma == 0.2.2
+ langchain_huggingface == 0.1.2
+ langchain_community == 0.3.19
+ chromadb == 0.6.3
+ pymupdf == 1.25.1

src/constants/system_prompts.py CHANGED
@@ -42,6 +42,25 @@ When tool is required, or something prompt seem like request tool, respond in **

  > **Important:** No explanation, greetings, or comments should be included before or after this format. Return only the JSON block wrapped in `<tool_call> </tool_call>`.

+ ### Handling Image Generation Tool Calls
+ When the user requests image generation:
+ - Always return a new URL for each image generation request
+ - If the tool fails to generate a new image, return the URL from the last successful image generation
+ - Never return empty or null URLs for image generation requests
+ - If no previous image URL exists and the tool fails, respond with a clear error message
+ - Sometimes the input (or the last message) arrives in this format:
+ ```
+ {
+     "role": "tool",
+     "tool_call_id": "tool_call_id_here",
+     "content": "url_of_image_here",
+     "tool_call_name": "generate_image_url"
+ }
+ ```
+
+ You must return the image URL ([url_of_image_here]) to the user; make your response friendly and natural.
+
+
  ### Example
  #### Example 1:
  **User:**
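The new prompt section describes a tool-result message that carries the generated image URL. A hedged sketch of how such a message might be appended to the chat history before the follow-up generation (the helper below is hypothetical; only the key names come from the prompt text above, and the URL is illustrative):

```python
# Hypothetical sketch: appending an image-generation tool result to the chat
# history in the shape the system prompt describes. Key names come from the
# prompt text; build_tool_result_message itself is not part of the repo.
def build_tool_result_message(tool_call_id: str, image_url: str) -> dict:
    return {
        "role": "tool",
        "tool_call_id": tool_call_id,
        "content": image_url,  # URL produced by the image tool
        "tool_call_name": "generate_image_url",
    }


messages = [
    {"role": "user", "content": "Draw me a cat, please."},
]
# Illustrative values only; the real id and URL come from the tool call.
messages.append(build_tool_result_message("call_123", "http://0.0.0.0:7860/outputs/cat.png"))
# The model is then asked to respond again with this enriched history,
# returning the URL to the user in a friendly reply.
```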
src/main.py CHANGED
@@ -9,26 +9,32 @@ from fastapi.staticfiles import StaticFiles
  from constants.config import OUTPUT_DIR
  from models.responses.base_response import BaseResponse
  from routes import chat_routes, process_file_routes, vector_store_routes
- from utils.clients import image_pipeline_client, llama_cpp_client, transformer_client, vector_store_client
+ from utils.clients import (
+     image_pipeline_client,
+     llama_cpp_client,
+     transformer_client,
+     vector_store_client,
+ )
  from utils.exception import CustomException


  @asynccontextmanager
  async def lifespan(app: FastAPI):
      try:
-         transformer_client.load_model()
-         # vector_store_client.load_vector_store_client()
-         # image_pipeline_client.load_pipeline()
-         # llama_cpp_client.load()
+         # transformer_client.load_model()
+         vector_store_client.load_vector_store_client()
+         image_pipeline_client.load_pipeline()
+         llama_cpp_client.load()
          # pass

      except Exception as e:
          print(f"Error during startup: {str(e)}")
-         # raise e
+         raise e

      yield
-     # transformer_client.clear_resources()
-     # image_pipeline_client.clear_resources()
+     transformer_client.clear_resources()
+     image_pipeline_client.clear_resources()
+     llama_cpp_client.clear_resources()


  app = FastAPI(lifespan=lifespan)
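The lifespan hook now loads the clients on startup and clears them on shutdown. For context, a common shape for such a `clear_resources` routine when the client wraps a GPU-backed model is sketched below; this is an assumption-level illustration, not the repository's implementation.

```python
# Sketch of a clear_resources-style cleanup; assumes a module-level _model
# holding a GPU-backed object. This mirrors the intent of the lifespan
# shutdown calls but is not the repository's actual code.
import gc

import torch

_model = None  # placeholder for a loaded pipeline / LLM


def clear_resources() -> None:
    global _model
    _model = None  # drop the Python reference
    gc.collect()   # let the garbage collector reclaim host memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # release cached CUDA memory back to the driver
```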
src/services/chat_service.py CHANGED
@@ -82,15 +82,16 @@ def chat_generate_stream(

      with measure_time("Tool call handling"):
          tool_call_result = tools_helper.process_tool_calls(tool_calls)
-         tool_call_message = {
-             "role": "tool",
-             "content": tool_call_result.get("content", ""),
-         }
-         messages.append(tool_call_message)
+         # tool_call_message = {
+         #     "role": "tool",
+         #     "content": tool_call_result.get("content", ""),
+         # }
+         messages.append(tool_call_result)

      with measure_time("Generate new stream"):
          new_stream = client.generate_stream(messages, has_tool_call=False)
          for chunk in new_stream:
+             print(chunk.get("choices", [])[0].get("delta", {}).get("content"))
              yield chunk


@@ -112,11 +113,13 @@ def chat_generate(request: ChatRequest):

      with measure_time("Tool call handling"):
          tool_call_result = tools_helper.process_tool_calls(tool_calls=tool_calls)
-         tool_call_message = {
-             "role": "tool",
-             "content": tool_call_result.get("content", ""),
-         }
-         messages.append(tool_call_message)
+         # tool_call_message = {
+         #     "role": "tool",
+         #     "content": tool_call_result.get("content", ""),
+         # }
+         messages.append(tool_call_result)
+
+         print(messages)

      with measure_time("Generate new chat completion"):
          new_output = client.generate(messages=messages, has_tool_call=False)
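Both handlers now append `tool_call_result` to `messages` directly instead of wrapping it in a new dict, which only works if `process_tool_calls` already returns a complete chat message. The sketch below shows one plausible return shape consistent with that call site; the helper is hypothetical and not the repository's `tools_helper`.

```python
# Hypothetical sketch of a process_tool_calls return value that can be
# appended to `messages` as-is, consistent with messages.append(tool_call_result).
def process_tool_calls(tool_calls: list[dict]) -> dict:
    first = tool_calls[0] if tool_calls else {}
    # The requested tool would run here; a static string stands in for its result.
    result_content = "url_of_image_here"
    return {
        "role": "tool",
        "tool_call_id": first.get("id", ""),
        "content": result_content,
        "tool_call_name": first.get("function", {}).get("name", ""),
    }
```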
src/services/image_service.py CHANGED
@@ -4,10 +4,10 @@ from constants.config import OUTPUT_DIR
  from utils.clients import image_pipeline_client

  negative_promt = "blurry, distorted, pixelated, incomplete, poorly drawn, misaligned, weird proportions, bad perspective, unnatural colors, noisy, out of focus, glitchy, unsharp, overexposed, underexposed, poorly lit, bad composition, excessive noise, oversaturated, too dark, too bright, inconsistent lighting, discolored, overly stylized, unrealistic, awkward pose, unbalanced, mismatched, distorted features, flat, unnatural texture, chaotic, unreadable, incoherent, asymmetrical, low quality, lowres, wrong anatomy, bad anatomy, deformed, disfigured, ugly"
- width = 512
- height = 512
+ width = 64
+ height = 64
  guidance_scale = 7.5
- num_inference_steps = 30
+ num_inference_steps = 1

  base_url = "http://0.0.0.0:7860"

@@ -36,6 +36,6 @@ def generate_image_url(prompt: str) -> str:
          image_path = os.path.join(OUTPUT_DIR, file_name)
          image.save(image_path)

-         return f"{base_url}/{OUTPUT_DIR}/{file_name}"
+         return f"{base_url}{OUTPUT_DIR}/{file_name}"
      except Exception as e:
          raise RuntimeError(f"Failed to generate image: {e}")
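The returned URL now concatenates `base_url` and `OUTPUT_DIR` with no separating slash, which presumably relies on `OUTPUT_DIR` carrying a leading slash. If that assumption is ever broken, a join that normalizes slashes avoids malformed URLs; a small sketch (the `OUTPUT_DIR` value shown is only an example):

```python
# Sketch: building the public URL for a saved image without depending on
# whether OUTPUT_DIR carries a leading slash. posixpath keeps forward slashes
# regardless of OS. base_url matches image_service.py; OUTPUT_DIR is an example.
import posixpath

base_url = "http://0.0.0.0:7860"
OUTPUT_DIR = "outputs"  # example value; the real constant lives in constants.config


def public_image_url(file_name: str) -> str:
    path = posixpath.join("/", OUTPUT_DIR, file_name)  # -> "/outputs/<file>"
    return f"{base_url}{path}"


print(public_image_url("abc123.png"))  # http://0.0.0.0:7860/outputs/abc123.png
```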
src/utils/clients/llama_cpp_client.py CHANGED
@@ -3,7 +3,6 @@ from typing import Generator, List
  import uuid
  from constants.config import GGUF_FILE_NAME, GGUF_REPO_ID
  from utils.stream_helper import process_stream_content
- from utils.timing import measure_time
  from utils.tools import tools_define
  from utils.tools.tools_helper import extract_tool_calls_and_reupdate_output

@@ -15,6 +14,11 @@ def is_loaded() -> bool:
      return _llm is not None


+ def clear_resources():
+     global _llm
+     _llm = None
+
+
  def load():
      try:
          import llama_cpp
@@ -25,9 +29,21 @@ def load():

      global _llm

-     _llm = llama_cpp.Llama.from_pretrained(
-         repo_id=GGUF_REPO_ID,
-         filename=GGUF_FILE_NAME,
+     # _llm = llama_cpp.Llama.from_pretrained(
+     #     repo_id=GGUF_REPO_ID,
+     #     filename=GGUF_FILE_NAME,
+     #     n_threads=os.cpu_count(),
+     #     n_gpu_layers=-1,
+     #     n_ctx=4096,
+     #     verbose=True,
+     #     use_mlock=True,
+     #     use_mmap=True,
+     #     # messages_to_prompt=messages_to_prompt,
+     #     # completion_to_prompt=completion_to_prompt,
+     # )
+
+     _llm = llama_cpp.Llama(
+         model_path=f"./.cache/{GGUF_FILE_NAME}",
          n_threads=os.cpu_count(),
          n_gpu_layers=-1,
          n_ctx=4096,
@@ -87,8 +103,8 @@
      output = _llm.create_chat_completion(
          messages,  # type: ignore
          stream=True,
-         tools=tools_define.tools,  # type: ignore
-         tool_choice="auto",
+         tools=tools,  # type: ignore
+         tool_choice=tool_choice,
      )  # type: ignore

      def content_generator():
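`load()` now points `llama_cpp.Llama` at `./.cache/{GGUF_FILE_NAME}` instead of calling `Llama.from_pretrained`, while `huggingface-hub` is added to requirements.txt and `.cache/*` to .gitignore. One way the GGUF file could be staged into that directory is via `huggingface_hub`; the snippet below is a sketch under the assumption that the same `GGUF_REPO_ID`/`GGUF_FILE_NAME` constants are used (placeholder values shown):

```python
# Sketch: pre-downloading the GGUF model into ./.cache so that
# llama_cpp.Llama(model_path=f"./.cache/{GGUF_FILE_NAME}") can find it.
# This staging step is an assumption about the workflow, not code from the repo.
from huggingface_hub import hf_hub_download

GGUF_REPO_ID = "some-org/some-gguf-repo"  # placeholder; real value in constants.config
GGUF_FILE_NAME = "model.Q4_K_M.gguf"      # placeholder; real value in constants.config

# Downloads the file and places it under ./.cache (which this commit adds to .gitignore).
local_path = hf_hub_download(
    repo_id=GGUF_REPO_ID,
    filename=GGUF_FILE_NAME,
    local_dir="./.cache",
)
print(f"GGUF staged at: {local_path}")
```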