Commit e48d33f
Parent(s): e086b97

Refactor Dockerfile and .dockerignore: Update file copying strategy to include only necessary files and improve ignored patterns for better build efficiency.

Files changed:
- .dockerignore (+21 -7)
- Dockerfile (+3 -2)
- src/utils/llama_cpp_client.py (+21 -21)
.dockerignore CHANGED

@@ -1,27 +1,41 @@
 # Byte-compiled / optimized / DLL files
-
-
-
-
-
+
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyw
+*.pyz

 # Distribution / packaging
-
+
+*.egg-info/

 # OS files
+
 .DS_Store

 # Environment files
+
 .env
-.env
+.env.*

 # Project data and outputs
+
 outputs/
 uploads/
 data/

 # VSCode settings
+
 .vscode/

 # Git
+
 .git/
+
+.venv
+.idea
+.vscode
+*.md
+.git
+.env
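
As a quick sanity check of the expanded ignore list, the short Python sketch below (not part of the commit) matches the new patterns against the top-level entries of the build context. It uses fnmatch only as a rough approximation of Docker's matching rules (Docker follows Go's filepath.Match semantics plus the "**" wildcard), so treat its output as indicative rather than authoritative.

# Rough approximation of the new .dockerignore: report which top-level entries
# of the current directory would be skipped when the build context is sent.
import fnmatch
from pathlib import Path

# Patterns taken from the new .dockerignore in this commit.
PATTERNS = [
    "__pycache__/", "*.py[cod]", "*.pyo", "*.pyw", "*.pyz",
    "*.egg-info/", ".DS_Store", ".env", ".env.*",
    "outputs/", "uploads/", "data/", ".vscode/", ".git/",
    ".venv", ".idea", ".vscode", "*.md", ".git",
]

def is_ignored(name: str) -> bool:
    # Strip a trailing "/" so directory patterns match the bare entry name.
    return any(fnmatch.fnmatch(name, p.rstrip("/")) for p in PATTERNS)

for entry in sorted(Path(".").iterdir()):
    status = "ignored" if is_ignored(entry.name) else "kept"
    print(f"{status:8} {entry.name}")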
Dockerfile CHANGED

@@ -35,8 +35,9 @@ RUN pip install --no-cache-dir "llama-cpp-python==0.3.8" --extra-index-url https
 RUN grep -v "llama-cpp-python" requirements.txt > requirements-no-llama.txt && \
     pip install --no-cache-dir -r requirements-no-llama.txt

-# 6. Copy
-COPY . .
+# 6. Copy only necessary files and folders
+COPY requirements.txt .
+COPY src/ ./src

 # 7. Expose the port FastAPI will run on
 EXPOSE 7860
src/utils/llama_cpp_client.py CHANGED

@@ -7,25 +7,25 @@ from models.responses.chat_response import ChatResponse
 from utils.timing import measure_time
 from utils.tools import tools_define

-from transformers import AutoTokenizer
+# from transformers import AutoTokenizer

-tokenizer = AutoTokenizer.from_pretrained("modularai/Llama-3.1-8B-Instruct-GGUF")
+# tokenizer = AutoTokenizer.from_pretrained("modularai/Llama-3.1-8B-Instruct-GGUF")


-def messages_to_prompt(messages):
-    messages = [{"role": m.role.value, "content": m.content} for m in messages]
-    prompt = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    return prompt
+# def messages_to_prompt(messages):
+#     messages = [{"role": m.role.value, "content": m.content} for m in messages]
+#     prompt = tokenizer.apply_chat_template(
+#         messages, tokenize=False, add_generation_prompt=True
+#     )
+#     return prompt


-def completion_to_prompt(completion):
-    messages = [{"role": "user", "content": completion}]
-    prompt = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    return prompt
+# def completion_to_prompt(completion):
+#     messages = [{"role": "user", "content": completion}]
+#     prompt = tokenizer.apply_chat_template(
+#         messages, tokenize=False, add_generation_prompt=True
+#     )
+#     return prompt


 # llm = llama_cpp.Llama(

@@ -44,8 +44,8 @@ llm = llama_cpp.Llama.from_pretrained(
     n_gpu_layers=-1,
     n_ctx=4096,
     verbose=True,
-    messages_to_prompt=messages_to_prompt,
-    completion_to_prompt=completion_to_prompt,
+    # messages_to_prompt=messages_to_prompt,
+    # completion_to_prompt=completion_to_prompt,
 )


@@ -60,11 +60,11 @@ def create(messages: List[Message], has_tool_call: bool = True):
     try:
         with measure_time("Starting create chat completion"):
             output = llm.create_chat_completion(
-                prompt,
-                tools=tools,
+                prompt,  # type: ignore
+                tools=tools,  # type: ignore
                 tool_choice=tool_choice,
             )  # type: ignore
-        return ChatResponse.from_llm_output(output)
+        return ChatResponse.from_llm_output(output)  # type: ignore
     except Exception as e:
         print(f"Error in create chat completion: {str(e)}")
         raise

@@ -74,9 +74,9 @@ def create_stream(messages: List[Message]) -> Generator[ChatResponse, None, None
     prompt = [message.to_map() for message in messages]

     output = llm.create_chat_completion(
-        prompt,
+        prompt,  # type: ignore
         stream=True,
-        tools=tools_define.tools,
+        tools=tools_define.tools,  # type: ignore
         tool_choice="auto",
     )  # type: ignore
     last_role = None
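
For context on the llama_cpp_client.py change: the commit comments out the transformers AutoTokenizer and the manual messages_to_prompt / completion_to_prompt helpers, leaving chat templating to llama-cpp-python's create_chat_completion, which accepts OpenAI-style message dicts directly. The sketch below (not part of the commit) illustrates that call pattern; the repo_id is borrowed from the commented-out tokenizer line, the filename pattern is a guess, and the project's tools / tool_choice wiring from tools_define is omitted for brevity.

import llama_cpp

# Assumed model location, for illustration only; the diff does not show the
# repo_id/filename the project actually loads.
llm = llama_cpp.Llama.from_pretrained(
    repo_id="modularai/Llama-3.1-8B-Instruct-GGUF",
    filename="*Q4_K_M.gguf",
    n_gpu_layers=-1,  # options visible in the diff
    n_ctx=4096,
    verbose=True,
)

messages = [{"role": "user", "content": "Say hello in one sentence."}]

# Non-streaming call, mirroring create(): message dicts go straight in, with no
# manual apply_chat_template step.
output = llm.create_chat_completion(messages)
print(output["choices"][0]["message"]["content"])

# Streaming call, mirroring create_stream(): chunks arrive as incremental deltas.
for chunk in llm.create_chat_completion(messages, stream=True):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)
print()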