LeoNguyen101120 committed
Commit 32efff5 · 1 Parent(s): e3a80c0

Update requirements and refactor client integration: Add extra index URL for PyTorch in requirements.txt, integrate open_ai_client in main.py, and adjust image generation parameters in image_service.py. Refactor llama_cpp_client to improve model loading configuration and enhance error handling in image_pipeline_client.

requirements.txt CHANGED
@@ -8,6 +8,7 @@ diffusers == 0.33.1
 accelerate == 1.6.0
 # transformers == 4.52.4
 torch==2.7.0
+--extra-index-url https://download.pytorch.org/whl/cu128
 
 # # If use bitsandbytes with cuda
 # https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
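
Note: --extra-index-url is a standard pip option that is also valid as a line inside a requirements file; pip then consults the PyTorch index in addition to PyPI when resolving packages, so torch==2.7.0 can be satisfied by a CUDA 12.8 build. Installing on such a machine is roughly equivalent to:

pip install torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/cu128

The cu128 tag is taken from the diff above; other CUDA versions use a different index path.
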
src/constants/config.py CHANGED
@@ -4,7 +4,6 @@ IS_APPLE_SILICON = torch.backends.mps.is_available()
 IS_CUDA_AVAILABLE = torch.cuda.is_available()
 
 TORCH_DEVICE = "cuda" if IS_CUDA_AVAILABLE else "mps" if IS_APPLE_SILICON else "cpu"
-
 # Enable quantization for CPU/MPS
 USE_QUANT = IS_CUDA_AVAILABLE
 
src/main.py CHANGED
@@ -12,6 +12,7 @@ from routes import chat_routes, process_file_routes, vector_store_routes
 from utils.clients import (
     image_pipeline_client,
     llama_cpp_client,
+    open_ai_client,
     transformer_client,
     vector_store_client,
 )
@@ -25,6 +26,7 @@ async def lifespan(app: FastAPI):
         vector_store_client.load_vector_store_client()
         image_pipeline_client.load_pipeline()
         llama_cpp_client.load()
+        # open_ai_client.load_open_ai_client()
         # pass
 
     except Exception as e:
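
The open_ai_client module itself is not part of this commit and its load call is still commented out in lifespan. A minimal sketch of what such a wrapper could look like, assuming the official openai package and an OPENAI_API_KEY environment variable (the structure below is illustrative and only mirrors the repository's other client modules, it is not the actual implementation):

# Hypothetical sketch of an open_ai_client module (not shown in this commit).
import os

_client = None  # module-level singleton, like the other modules in utils/clients


def load_open_ai_client():
    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError(
            "openai is not installed. Please install it using 'pip install openai'."
        )

    global _client
    # Assumes the key is supplied via the OPENAI_API_KEY environment variable.
    _client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
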
src/services/chat_service.py CHANGED
@@ -82,16 +82,11 @@ def chat_generate_stream(
 
     with measure_time("Tool call handling"):
         tool_call_result = tools_helper.process_tool_calls(tool_calls)
-        # tool_call_message = {
-        #     "role": "tool",
-        #     "content": tool_call_result.get("content", ""),
-        # }
         messages.append(tool_call_result)
 
     with measure_time("Generate new stream"):
         new_stream = client.generate_stream(messages, has_tool_call=False)
         for chunk in new_stream:
-            print(chunk.get("choices", [])[0].get("delta", {}).get("content"))
             yield chunk
 
 
@@ -113,14 +108,8 @@ def chat_generate(request: ChatRequest):
 
     with measure_time("Tool call handling"):
         tool_call_result = tools_helper.process_tool_calls(tool_calls=tool_calls)
-        # tool_call_message = {
-        #     "role": "tool",
-        #     "content": tool_call_result.get("content", ""),
-        # }
         messages.append(tool_call_result)
 
-    print(messages)
-
     with measure_time("Generate new chat completion"):
         new_output = client.generate(messages=messages, has_tool_call=False)
 
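
For context, the tool-call flow that this diff cleans up (dead commented code and debug prints removed) follows the common OpenAI-style round trip: a helper executes the requested tool, wraps the result as a role "tool" message, and the model is called again with the updated history. tools_helper.process_tool_calls is not shown in this commit; the sketch below only illustrates that assumed message shape, and the tool registry in it is invented for the example:

import json

# Hypothetical tool registry; the project's real tools are not part of this diff.
TOOLS = {
    "get_time": lambda **kwargs: "2024-01-01T00:00:00Z",
}


def process_tool_calls(tool_calls: list) -> dict:
    # Execute the first requested tool and wrap its output as a "tool" message.
    call = tool_calls[0]
    name = call["function"]["name"]
    args = json.loads(call["function"].get("arguments") or "{}")
    result = TOOLS[name](**args)
    return {
        "role": "tool",
        "tool_call_id": call.get("id", ""),
        "content": str(result),
    }

As in the diff, the returned dict is appended to messages and the client is invoked again with has_tool_call=False so the model produces a normal completion instead of another tool call.
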
src/services/image_service.py CHANGED
@@ -4,12 +4,12 @@ from constants.config import OUTPUT_DIR
 from utils.clients import image_pipeline_client
 
 negative_promt = "blurry, distorted, pixelated, incomplete, poorly drawn, misaligned, weird proportions, bad perspective, unnatural colors, noisy, out of focus, glitchy, unsharp, overexposed, underexposed, poorly lit, bad composition, excessive noise, oversaturated, too dark, too bright, inconsistent lighting, discolored, overly stylized, unrealistic, awkward pose, unbalanced, mismatched, distorted features, flat, unnatural texture, chaotic, unreadable, incoherent, asymmetrical, low quality, lowres, wrong anatomy, bad anatomy, deformed, disfigured, ugly"
-width = 64
-height = 64
+width = 512
+height = 512
 guidance_scale = 7.5
-num_inference_steps = 1
+num_inference_steps = 30
 
-base_url = "http://0.0.0.0:7860"
+base_url = "http://leonguyen101120.zapto.org:7860"
 
 
 def generate_image_url(prompt: str) -> str:
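
The new values (512x512, 30 steps) are the usual working settings for Stable Diffusion v1.x; 64x64 with a single inference step produces essentially noise, so this is a functional fix rather than a tuning tweak. A rough sketch of how these module-level parameters typically feed a diffusers pipeline call follows; the body of generate_image_url is not shown in this diff, and the pipeline attribute on image_pipeline_client is an assumption:

# Sketch only: how the parameters above would drive a StableDiffusionPipeline call.
def generate_image(prompt: str):
    image = image_pipeline_client.pipeline(
        prompt,
        negative_prompt=negative_promt,  # keeps the module's existing variable name
        width=width,                     # 512
        height=height,                   # 512
        guidance_scale=guidance_scale,   # 7.5, classifier-free guidance strength
        num_inference_steps=num_inference_steps,  # 30 denoising steps
    ).images[0]
    return image
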
src/utils/clients/image_pipeline_client.py CHANGED
@@ -1,4 +1,4 @@
-from constants.config import IMAGE_MODEL_ID_OR_LINK
+from constants.config import IMAGE_MODEL_ID_OR_LINK, TORCH_DEVICE
 from utils.timing import measure_time
 
 
@@ -20,6 +20,9 @@ def load_pipeline():
         raise ImportError(
             "diffusers is not installed. Please install it using 'pip install diffusers'."
         )
+
+    print(TORCH_DEVICE)
+
 
     with measure_time("Load image pipeline"):
         pipeline = StableDiffusionPipeline.from_pretrained(
@@ -28,7 +31,7 @@ def load_pipeline():
             variant="fp16",
             # safety_checker=True,
             use_safetensors=True,
-        )
+        ).to(TORCH_DEVICE)
 
 
 def clear_resources():
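
The substantive change here is that the loaded pipeline is now moved onto TORCH_DEVICE instead of staying on the CPU. A standalone sketch of the same loading pattern with the public diffusers API; the model id and dtype below are assumptions for illustration, while the repository passes IMAGE_MODEL_ID_OR_LINK:

# Standalone sketch: load a Stable Diffusion pipeline and move it to the selected device.
import torch
from diffusers import StableDiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # assumed model id; the repo uses IMAGE_MODEL_ID_OR_LINK
    torch_dtype=torch.float16,         # fp16 weights, matching variant="fp16" in the diff
    variant="fp16",
    use_safetensors=True,
).to(device)                           # same idea as the .to(TORCH_DEVICE) added in this commit
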
src/utils/clients/llama_cpp_client.py CHANGED
@@ -27,31 +27,17 @@ def load():
             "llama_cpp is not installed. Please install it using 'pip install llama-cpp-python'."
         )
 
+
     global _llm
 
-    # _llm = llama_cpp.Llama.from_pretrained(
-    #     repo_id=GGUF_REPO_ID,
-    #     filename=GGUF_FILE_NAME,
-    #     n_threads=os.cpu_count(),
-    #     n_gpu_layers=-1,
-    #     n_ctx=4096,
-    #     verbose=True,
-    #     use_mlock=True,
-    #     use_mmap=True,
-    #     # messages_to_prompt=messages_to_prompt,
-    #     # completion_to_prompt=completion_to_prompt,
-    # )
-
-    _llm = llama_cpp.Llama(
-        model_path=f"./.cache/{GGUF_FILE_NAME}",
+    _llm = llama_cpp.Llama.from_pretrained(
+        repo_id=GGUF_REPO_ID,
+        filename=GGUF_FILE_NAME,
         n_threads=os.cpu_count(),
         n_gpu_layers=-1,
-        n_ctx=4096,
+        n_ctx=16384,
         verbose=True,
-        use_mlock=True,
         use_mmap=True,
-        # messages_to_prompt=messages_to_prompt,
-        # completion_to_prompt=completion_to_prompt,
     )
 
 
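
Llama.from_pretrained downloads the GGUF file from the Hugging Face Hub (it requires the huggingface-hub package) instead of reading a file manually placed under ./.cache, and the context window grows from 4096 to 16384 tokens. A standalone sketch of the new loading path follows; the repo id and filename are example values, while the project's actual ones come from GGUF_REPO_ID and GGUF_FILE_NAME:

# Standalone sketch of loading a GGUF model with llama-cpp-python via the HF Hub.
import os
import llama_cpp

llm = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-3B-Instruct-GGUF",  # example repo; the project uses GGUF_REPO_ID
    filename="*q4_k_m.gguf",                  # glob patterns are accepted; project uses GGUF_FILE_NAME
    n_threads=os.cpu_count(),
    n_gpu_layers=-1,  # offload all layers to the GPU when one is available
    n_ctx=16384,      # the larger context window set in this commit
    verbose=True,
    use_mmap=True,
)

out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=64,
)
print(out["choices"][0]["message"]["content"])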