Commit 6b4fb1d
Parent(s): c2767f1
Update Dockerfile and requirements: Introduce requirements_for_server.txt for streamlined dependency management, remove requirements.local.txt, and adjust Dockerfile to install local packages. Refactor chat_service.py to utilize transformer_client for message generation, and update .gitignore to include local_packages_for_win.
Files changed:
- .gitignore (+2 -1)
- .vscode/launch.json (+6 -15)
- Dockerfile (+9 -28)
- requirements.local.txt (+0 -25)
- requirements.txt (+4 -3)
- requirements_for_server.txt (+26 -0)
- src/services/chat_service.py (+2 -3)
- src/utils/clients/transformer_client.py (+2 -0)
.gitignore CHANGED

```diff
@@ -13,4 +13,5 @@ data/*
 venv/
 .venv/
 bitsandbytes/*
-llama-cpp-python/*
+llama-cpp-python/*
+local_packages_for_win/*
```
.vscode/launch.json CHANGED

```diff
@@ -1,7 +1,4 @@
 {
-  // Use IntelliSense to learn about possible attributes.
-  // Hover to view descriptions of existing attributes.
-  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
   "version": "0.2.0",
   "configurations": [
     {
@@ -9,14 +6,6 @@
       "type": "debugpy",
       "request": "launch",
       "module": "uvicorn",
-      // "env": {
-      //   "db_username": "postgres",
-      //   "db_password": "secret",
-      //   "host_server": "localhost",
-      //   "database_name": "fastapi",
-      //   "ssl_mode": "prefer",
-      //   "db_server_port": "5432"
-      // },
       "env": {
         "PYTHONPATH": "${workspaceFolder}/src"
       },
@@ -26,9 +15,11 @@
         "--port",
         "7860",
        "--host",
-        "0.0.0.0"
-      ]
+        "0.0.0.0"
+      ],
+      "windows": {
+        "python": "${workspaceFolder}\\.venv\\Scripts\\python.exe"
+      }
     }
-
   ]
-}
+}
```
Dockerfile CHANGED

(Comments added in this commit were in Vietnamese; translated to English below.)

```diff
@@ -1,48 +1,29 @@
-# 1. Use an official Python base image (slim for smaller size)
 FROM python:3.11-slim
 
-# 2. Set environment variables for Python
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1
 
-# 3. Set the working directory in the container
 WORKDIR /src
 
-#
+# 1. Install the required packages
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential \
-    # gcc \
-    # g++ \
     cmake \
-    # tesseract-ocr \
-    # libgl1 \
     libglib2.0-0 \
     git \
     && rm -rf /var/lib/apt/lists/*
 
-
-
-#
-
-# RUN pip install --no-cache-dir --upgrade -r requirements.txt
-
-#
-# RUN pip install --no-cache-dir "llama-cpp-python==0.3.8" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
-
-# # Remove llama line from requirements and install the rest
-# RUN grep -v "llama-cpp-python" requirements.txt > requirements-no-llama.txt && \
-#     pip install --no-cache-dir -r requirements-no-llama.txt
-
-run pip install --no-cache-dir -r requirements.txt
-
-# 6. Copy only necessary files and folders
-COPY requirements.txt .
+# 2. Copy requirements and the local packages
+COPY requirements_for_server.txt ./
+COPY local_packages_for_server/ /tmp/local_packages_for_server/
+
+# 3. Install the packages from the local wheels
+RUN pip install --no-cache-dir --find-links=/tmp/local_packages_for_server -r requirements_for_server.txt
+
+# 4. Copy the source code
 COPY src/ .
 
-#
+# 5. Expose the port and run the app
 EXPOSE 7860
-
-# 8. Set the default command to run the FastAPI app
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
```
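The server image now installs everything from wheels shipped in the build context (`--find-links=/tmp/local_packages_for_server`), so `local_packages_for_server/` must be populated before `docker build` runs. The commit doesn't include that step; below is a minimal sketch of one way to prefetch the wheels with `pip download` (the helper and its defaults are assumptions, not code from this repo):

```python
# Hypothetical helper (not in this repo): prefetch the wheels that the
# Dockerfile later installs with --find-links. Run it on a machine with
# network access before building the image.
import subprocess
import sys
from pathlib import Path


def prefetch_wheels(requirements: str = "requirements_for_server.txt",
                    dest: str = "local_packages_for_server") -> None:
    """Download every pinned requirement as a wheel into `dest`."""
    Path(dest).mkdir(exist_ok=True)
    subprocess.run(
        [sys.executable, "-m", "pip", "download",
         "-r", requirements, "-d", dest],
        check=True,
    )


if __name__ == "__main__":
    prefetch_wheels()
```

Note that `pip download` fetches wheels for the machine it runs on; to target the image's CPython 3.11/Linux environment from elsewhere, pip's `--platform`, `--python-version`, and `--only-binary=:all:` flags would be needed.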
requirements.local.txt DELETED

```diff
@@ -1,25 +0,0 @@
-fastapi[standard]>=0.113.0,<0.114.0
-pydantic>=2.7.0,<3.0.0
-uvicorn>=0.34.2
-python-dotenv>=1.1.0
-requests>=2.32.3
-openai>=1.76.0
-torch>=2.3.0,<2.6.0
-Pillow>=11.2.1
-yfinance>=0.2.56
-python-multipart>=0.0.20
-diffusers>=0.33.1
-transformers>=4.51.3
-accelerate>=1.6.0
-beautifulsoup4>=4.13.4
-pymupdf>=1.25.1
-docx2txt>=0.8
-pytesseract>=0.3.13
-langchain_community>=0.3.19
-langchain>=0.3.20
-langchain_chroma>=0.2.2
-chromadb>=0.6.3
-sentence_transformers>=4.1.0
-langchain_huggingface>=0.1.2
-huggingface_hub[hf_xet]
-llama-cpp-python==0.3.8
```
requirements.txt CHANGED

```diff
@@ -1,13 +1,14 @@
 fastapi[standard] == 0.114.0
 uvicorn == 0.34.2
-torch==2.7.0
---extra-index-url https://download.pytorch.org/whl/cu128
 
 # If use diffusers
 diffusers == 0.33.1
-# bitsandbytes == 0.46.0
 accelerate == 1.6.0
 transformers == 4.52.4
+torch==2.7.0
+
+# Offline install
+bitsandbytes -f ./local_packages/bitsandbytes-0.46.0-cp310-cp310-win_amd64.whl
 
 # If use llama-cpp-python
 # llama-cpp-python == 0.3.8
```
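The new `bitsandbytes` line resolves against a single local wheel whose filename tags (`cp310-cp310-win_amd64`) pin it to CPython 3.10 on 64-bit Windows, which is why this file stays for local development while the server gets its own requirements file. As a quick sanity check, such tags can be parsed with the `packaging` library (assumed installed; this snippet is illustrative, not from the repo):

```python
# Parse a wheel filename to see which interpreter/platform it targets.
from packaging.utils import parse_wheel_filename

name, version, _build, tags = parse_wheel_filename(
    "bitsandbytes-0.46.0-cp310-cp310-win_amd64.whl"
)
print(name, version)   # bitsandbytes 0.46.0
for tag in sorted(str(t) for t in tags):
    print(tag)         # cp310-cp310-win_amd64 -> CPython 3.10, 64-bit Windows
```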
requirements_for_server.txt ADDED

```diff
@@ -0,0 +1,26 @@
+fastapi[standard] == 0.114.0
+uvicorn == 0.34.2
+
+# If use diffusers
+diffusers == 0.33.1
+accelerate == 1.6.0
+transformers == 4.52.4
+
+# # Offline install
+# torch==2.7.0
+# -f ./local_packages/torch-2.7.0+cu128-cp310-cp310-win_amd64.whl
+# bitsandbytes -f ./local_packages/bitsandbytes-0.46.0-cp310-cp310-win_amd64.whl
+
+# If use llama-cpp-python
+# llama-cpp-python == 0.3.8
+# --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
+
+# If process file feature enable
+# beautifulsoup4 == 4.13.4
+# requests == 2.32.3
+# langchain_chroma == 0.2.2
+# langchain_huggingface == 0.1.2
+# langchain_community == 0.3.19
+# chromadb == 0.6.3
+# pymupdf == 1.25.1
+
```
src/services/chat_service.py CHANGED

(The removed `output =` and `new_output =` lines are truncated in the source view; their right-hand sides are not recoverable.)

```diff
@@ -3,7 +3,6 @@ from models.requests.chat_request import ChatRequest
 from services import vector_store_service
 
 # from utils.llama_cpp_client import create, create_stream
-from utils.clients import open_ai_client
 from utils.clients import transformer_client
 from utils.timing import measure_time
 from utils.tools import tools_helper
@@ -87,7 +86,7 @@ def chat_generate(request: ChatRequest):
     messages = build_context_prompt(request)
     messages.extend(request.messages)
 
-    output =
+    output = transformer_client.generate(messages=messages)
     choices = output.get("choices", [])
 
     tool_calls = choices[0].get("message").get("tool_calls")
@@ -100,5 +99,5 @@ def chat_generate(request: ChatRequest):
     messages.append(tool_call_message)
 
     # new_output = generate(messages=messages, has_tool_call=False)
-    new_output =
+    new_output = transformer_client.generate(messages=messages, has_tool_call=False)
     return new_output
```
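After this refactor, `transformer_client.generate` is the single generation path, and the surrounding code implies it returns an OpenAI-style completion dict (`choices[0]["message"]["tool_calls"]`). A minimal sketch of the calling pattern follows; the message payload is illustrative, and only `generate`, the `choices` shape, and the `has_tool_call` flag come from the diff:

```python
# Sketch of the flow chat_generate now follows (illustrative payload).
from utils.clients import transformer_client

messages = [{"role": "user", "content": "What's the weather in Hanoi?"}]

# First pass: the model may answer directly or request tool calls.
output = transformer_client.generate(messages=messages)
choices = output.get("choices", [])
tool_calls = choices[0].get("message", {}).get("tool_calls") if choices else None

if tool_calls:
    # ...execute the tools and append their results to `messages`...
    # Second pass with tool calling disabled to get the final answer.
    output = transformer_client.generate(messages=messages, has_tool_call=False)
```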
src/utils/clients/transformer_client.py CHANGED

```diff
@@ -35,6 +35,7 @@ def load_model():
     try:
         with measure_time("Load model"):
             if USE_QUANT:
+                print("Using quantization")
                 quantization_config = BitsAndBytesConfig(
                     load_in_4bit=True,
                     bnb_4bit_quant_type="nf4",
@@ -52,6 +53,7 @@ def load_model():
                     # max_memory={0: "4GiB"},  # Limit GPU memory usage
                 )
             else:
+                print("Not using quantization")
                 _model = AutoModelForCausalLM.from_pretrained(
                     LLM_MODEL_NAME,
                     torch_dtype=MODEL_OPTIMIZATION["torch_dtype"],
```
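For context, the `USE_QUANT` branch pairs `BitsAndBytesConfig` with `AutoModelForCausalLM.from_pretrained`. A standalone sketch of that pattern follows; the model name, compute dtype, and `device_map` are assumptions, since the hunks only show `load_in_4bit` and `bnb_4bit_quant_type="nf4"`:

```python
# Standalone sketch of 4-bit NF4 loading with transformers + bitsandbytes.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"  # placeholder for the repo's LLM_MODEL_NAME

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # normal-float 4-bit, as in the diff
    bnb_4bit_compute_dtype=torch.float16,  # assumed; not visible in the hunk
)

print("Using quantization")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",                     # assumed; not visible in the hunk
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
```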