LeoNguyen101120 committed
Commit 6b4fb1d · 1 Parent(s): c2767f1

Update Dockerfile and requirements: introduce requirements_for_server.txt for streamlined dependency management, remove requirements.local.txt, and adjust the Dockerfile to install packages from a local wheel directory. Refactor chat_service.py to use transformer_client for message generation, and update .gitignore to include local_packages_for_win.

.gitignore CHANGED
@@ -13,4 +13,5 @@ data/*
 venv/
 .venv/
 bitsandbytes/*
-llama-cpp-python/*
+llama-cpp-python/*
+local_packages_for_win/*
.vscode/launch.json CHANGED
@@ -1,7 +1,4 @@
 {
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
         {
@@ -9,14 +6,6 @@
             "type": "debugpy",
             "request": "launch",
             "module": "uvicorn",
-            // "env": {
-            //     "db_username": "postgres",
-            //     "db_password": "secret",
-            //     "host_server": "localhost",
-            //     "database_name": "fastapi",
-            //     "ssl_mode": "prefer",
-            //     "db_server_port": "5432"
-            // },
             "env": {
                 "PYTHONPATH": "${workspaceFolder}/src"
             },
@@ -26,9 +15,11 @@
             "--port",
             "7860",
             "--host",
-            "0.0.0.0",
-            ]
+            "0.0.0.0"
+            ],
+            "windows": {
+                "python": "${workspaceFolder}\\.venv\\Scripts\\python.exe"
+            }
         }
-
     ]
-}
+}
Dockerfile CHANGED
@@ -1,48 +1,29 @@
-# 1. Use an official Python base image (slim for smaller size)
 FROM python:3.11-slim
 
-# 2. Set environment variables for Python
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1
 
-# 3. Set the working directory in the container
 WORKDIR /src
 
-# 4. Install system dependencies required for Python packages and llama-cpp-python
+# 1. Install the required system packages
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential \
-    # gcc \
-    # g++ \
     cmake \
-    # tesseract-ocr \
-    # libgl1 \
     libglib2.0-0 \
     git \
     && rm -rf /var/lib/apt/lists/*
 
-RUN mkdir -p /tmp/cache /tmp/vector_store /.cache && \
-    chown -R 1000:1000 /tmp /.cache
-
-# 5. Copy requirements.txt and install Python dependencies
-COPY requirements.txt .
-# RUN pip install --no-cache-dir --upgrade -r requirements.txt
-
-# # Install llama-cpp-python first (faster rebuilds)
-# RUN pip install --no-cache-dir "llama-cpp-python==0.3.8" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
-
-# # Remove llama line from requirements and install the rest
-# RUN grep -v "llama-cpp-python" requirements.txt > requirements-no-llama.txt && \
-#     pip install --no-cache-dir -r requirements-no-llama.txt
-
-run pip install --no-cache-dir -r requirements.txt
-
-# 6. Copy only necessary files and folders
-COPY requirements.txt .
+# 2. Copy the requirements file and the local package wheels
+COPY requirements_for_server.txt ./
+COPY local_packages_for_server/ /tmp/local_packages_for_server/
+
+# 3. Install Python dependencies from the local wheel directory
+RUN pip install --no-cache-dir --find-links=/tmp/local_packages_for_server -r requirements_for_server.txt
+
+# 4. Copy the application source
 COPY src/ .
 
-# 7. Expose the port FastAPI will run on
+# 5. Expose the port and run the app
 EXPOSE 7860
-
-# 8. Set the default command to run the FastAPI app
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
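
The new install step assumes a local_packages_for_server/ directory of pre-downloaded wheels exists next to the Dockerfile at build time. A minimal sketch of how that directory might be populated before building the image, assuming the wheels are fetched with pip download (the helper script name is hypothetical, not part of this commit):

    # populate_local_packages.py -- hypothetical helper, not part of this commit
    import subprocess
    import sys

    # Download wheels for every requirement into the directory that the
    # Dockerfile copies to /tmp/local_packages_for_server/ and hands to
    # pip via --find-links.
    subprocess.run(
        [
            sys.executable, "-m", "pip", "download",
            "-r", "requirements_for_server.txt",
            "-d", "local_packages_for_server/",
        ],
        check=True,
    )

By default pip download fetches wheels for the host platform and interpreter, so building for a linux/python:3.11 image from another platform would need matching --platform/--python-version/--only-binary options; treat this as a sketch of the workflow, not a drop-in script.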
 
 
requirements.local.txt DELETED
@@ -1,25 +0,0 @@
-fastapi[standard]>=0.113.0,<0.114.0
-pydantic>=2.7.0,<3.0.0
-uvicorn>=0.34.2
-python-dotenv>=1.1.0
-requests>=2.32.3
-openai>=1.76.0
-torch>=2.3.0,<2.6.0
-Pillow>=11.2.1
-yfinance>=0.2.56
-python-multipart>=0.0.20
-diffusers>=0.33.1
-transformers>=4.51.3
-accelerate>=1.6.0
-beautifulsoup4>=4.13.4
-pymupdf>=1.25.1
-docx2txt>=0.8
-pytesseract>=0.3.13
-langchain_community>=0.3.19
-langchain>=0.3.20
-langchain_chroma>=0.2.2
-chromadb>=0.6.3
-sentence_transformers>=4.1.0
-langchain_huggingface>=0.1.2
-huggingface_hub[hf_xet]
-llama-cpp-python==0.3.8
requirements.txt CHANGED
@@ -1,13 +1,14 @@
 fastapi[standard] == 0.114.0
 uvicorn == 0.34.2
-torch==2.7.0
---extra-index-url https://download.pytorch.org/whl/cu128
 
 # If use diffusers
 diffusers == 0.33.1
-# bitsandbytes == 0.46.0
 accelerate == 1.6.0
 transformers == 4.52.4
+torch==2.7.0
+
+# Offline install
+bitsandbytes -f ./local_packages/bitsandbytes-0.46.0-cp310-cp310-win_amd64.whl
 
 # If use llama-cpp-python
 # llama-cpp-python == 0.3.8
requirements_for_server.txt ADDED
@@ -0,0 +1,26 @@
+fastapi[standard] == 0.114.0
+uvicorn == 0.34.2
+
+# If use diffusers
+diffusers == 0.33.1
+accelerate == 1.6.0
+transformers == 4.52.4
+
+# # Offline install
+# torch==2.7.0
+# -f ./local_packages/torch-2.7.0+cu128-cp310-cp310-win_amd64.whl
+# bitsandbytes -f ./local_packages/bitsandbytes-0.46.0-cp310-cp310-win_amd64.whl
+
+# If use llama-cpp-python
+# llama-cpp-python == 0.3.8
+# --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
+
+# If process file feature enable
+# beautifulsoup4 == 4.13.4
+# requests == 2.32.3
+# langchain_chroma == 0.2.2
+# langchain_huggingface == 0.1.2
+# langchain_community == 0.3.19
+# chromadb == 0.6.3
+# pymupdf == 1.25.1
+
src/services/chat_service.py CHANGED
@@ -3,7 +3,6 @@ from models.requests.chat_request import ChatRequest
 from services import vector_store_service
 
 # from utils.llama_cpp_client import create, create_stream
-from utils.clients import open_ai_client
 from utils.clients import transformer_client
 from utils.timing import measure_time
 from utils.tools import tools_helper
@@ -87,7 +86,7 @@ def chat_generate(request: ChatRequest):
     messages = build_context_prompt(request)
     messages.extend(request.messages)
 
-    output = open_ai_client.generate(messages=messages)
+    output = transformer_client.generate(messages=messages)
     choices = output.get("choices", [])
 
     tool_calls = choices[0].get("message").get("tool_calls")
@@ -100,5 +99,5 @@ def chat_generate(request: ChatRequest):
         messages.append(tool_call_message)
 
     # new_output = generate(messages=messages, has_tool_call=False)
-    new_output = open_ai_client.generate(messages=messages, has_tool_call=False)
+    new_output = transformer_client.generate(messages=messages, has_tool_call=False)
     return new_output
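
For context, chat_generate reads the client output as an OpenAI-style chat-completion dict. A minimal sketch of the shape those calls assume; the stub below is illustrative only, with field names inferred from the accesses in the diff rather than from the real transformer_client:

    # Illustrative stub of the response shape chat_generate expects.
    def stub_generate(messages, has_tool_call=True):
        return {
            "choices": [
                {
                    "message": {
                        "role": "assistant",
                        "content": "Hello!",
                        # Populated with a list of tool-call dicts when the
                        # model decides to invoke a tool; otherwise None.
                        "tool_calls": None,
                    }
                }
            ]
        }

    output = stub_generate(messages=[{"role": "user", "content": "Hi"}])
    tool_calls = output["choices"][0]["message"].get("tool_calls")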
src/utils/clients/transformer_client.py CHANGED
@@ -35,6 +35,7 @@ def load_model():
     try:
         with measure_time("Load model"):
             if USE_QUANT:
+                print("Using quantization")
                 quantization_config = BitsAndBytesConfig(
                     load_in_4bit=True,
                     bnb_4bit_quant_type="nf4",
@@ -52,6 +53,7 @@ def load_model():
                     # max_memory={0: "4GiB"},  # Limit GPU memory usage
                 )
             else:
+                print("Not using quantization")
                 _model = AutoModelForCausalLM.from_pretrained(
                     LLM_MODEL_NAME,
                     torch_dtype=MODEL_OPTIMIZATION["torch_dtype"],
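
The quantized branch only appears here in fragments. A self-contained sketch of a 4-bit NF4 load with transformers and bitsandbytes, matching the options visible above; the model name and compute dtype are placeholders, not the repository's LLM_MODEL_NAME or MODEL_OPTIMIZATION settings:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"  # placeholder model id

    # 4-bit NF4 quantization, as in the USE_QUANT branch of load_model().
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,  # assumption; not shown in the diff
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",  # requires accelerate, which is pinned in the requirements
    )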