LeoNguyen101120 committed
Commit 32efff5 · 1 Parent(s): e3a80c0

Update requirements and refactor client integration: Add extra index URL for PyTorch in requirements.txt, integrate open_ai_client in main.py, and adjust image generation parameters in image_service.py. Refactor llama_cpp_client to improve model loading configuration and enhance error handling in image_pipeline_client.

requirements.txt CHANGED
@@ -8,6 +8,7 @@ diffusers == 0.33.1
 accelerate == 1.6.0
 # transformers == 4.52.4
 torch==2.7.0
+--extra-index-url https://download.pytorch.org/whl/cu128
 
 # # If use bitsandbytes with cuda
 # https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
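
Note: --extra-index-url is a standard pip option that is also valid as a line inside a requirements file; pip then consults the PyTorch index in addition to PyPI when resolving packages, so torch==2.7.0 can be satisfied by a CUDA 12.8 build. Installing on such a machine is roughly equivalent to:

pip install torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/cu128

The cu128 tag is taken from the diff above; other CUDA versions use a different index path.
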
src/constants/config.py CHANGED
@@ -4,7 +4,6 @@ IS_APPLE_SILICON = torch.backends.mps.is_available()
 IS_CUDA_AVAILABLE = torch.cuda.is_available()
 
 TORCH_DEVICE = "cuda" if IS_CUDA_AVAILABLE else "mps" if IS_APPLE_SILICON else "cpu"
-
 # Enable quantization for CPU/MPS
 USE_QUANT = IS_CUDA_AVAILABLE
 
src/main.py CHANGED
@@ -12,6 +12,7 @@ from routes import chat_routes, process_file_routes, vector_store_routes
 from utils.clients import (
     image_pipeline_client,
     llama_cpp_client,
+    open_ai_client,
     transformer_client,
     vector_store_client,
 )
@@ -25,6 +26,7 @@ async def lifespan(app: FastAPI):
         vector_store_client.load_vector_store_client()
         image_pipeline_client.load_pipeline()
         llama_cpp_client.load()
+        # open_ai_client.load_open_ai_client()
         # pass
 
     except Exception as e:
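
The open_ai_client module itself is not part of this commit and its load call is still commented out in lifespan. A minimal sketch of what such a wrapper could look like, assuming the official openai package and an OPENAI_API_KEY environment variable (the structure below is illustrative and only mirrors the repository's other client modules, it is not the actual implementation):

# Hypothetical sketch of an open_ai_client module (not shown in this commit).
import os

_client = None  # module-level singleton, like the other modules in utils/clients


def load_open_ai_client():
    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError(
            "openai is not installed. Please install it using 'pip install openai'."
        )

    global _client
    # Assumes the key is supplied via the OPENAI_API_KEY environment variable.
    _client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
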
src/services/chat_service.py CHANGED
@@ -82,16 +82,11 @@ def chat_generate_stream(
 
     with measure_time("Tool call handling"):
         tool_call_result = tools_helper.process_tool_calls(tool_calls)
-        # tool_call_message = {
-        #     "role": "tool",
-        #     "content": tool_call_result.get("content", ""),
-        # }
         messages.append(tool_call_result)
 
     with measure_time("Generate new stream"):
         new_stream = client.generate_stream(messages, has_tool_call=False)
         for chunk in new_stream:
-            print(chunk.get("choices", [])[0].get("delta", {}).get("content"))
             yield chunk
 
 
@@ -113,14 +108,8 @@ def chat_generate(request: ChatRequest):
 
     with measure_time("Tool call handling"):
         tool_call_result = tools_helper.process_tool_calls(tool_calls=tool_calls)
-        # tool_call_message = {
-        #     "role": "tool",
-        #     "content": tool_call_result.get("content", ""),
-        # }
         messages.append(tool_call_result)
 
-    print(messages)
-
     with measure_time("Generate new chat completion"):
         new_output = client.generate(messages=messages, has_tool_call=False)
 
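
For context, the tool-call flow that this diff cleans up (dead commented code and debug prints removed) follows the common OpenAI-style round trip: a helper executes the requested tool, wraps the result as a role "tool" message, and the model is called again with the updated history. tools_helper.process_tool_calls is not shown in this commit; the sketch below only illustrates that assumed message shape, and the tool registry in it is invented for the example:

import json

# Hypothetical tool registry; the project's real tools are not part of this diff.
TOOLS = {
    "get_time": lambda **kwargs: "2024-01-01T00:00:00Z",
}


def process_tool_calls(tool_calls: list) -> dict:
    # Execute the first requested tool and wrap its output as a "tool" message.
    call = tool_calls[0]
    name = call["function"]["name"]
    args = json.loads(call["function"].get("arguments") or "{}")
    result = TOOLS[name](**args)
    return {
        "role": "tool",
        "tool_call_id": call.get("id", ""),
        "content": str(result),
    }

As in the diff, the returned dict is appended to messages and the client is invoked again with has_tool_call=False so the model produces a normal completion instead of another tool call.
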
src/services/image_service.py CHANGED
@@ -4,12 +4,12 @@ from constants.config import OUTPUT_DIR
 from utils.clients import image_pipeline_client
 
 negative_promt = "blurry, distorted, pixelated, incomplete, poorly drawn, misaligned, weird proportions, bad perspective, unnatural colors, noisy, out of focus, glitchy, unsharp, overexposed, underexposed, poorly lit, bad composition, excessive noise, oversaturated, too dark, too bright, inconsistent lighting, discolored, overly stylized, unrealistic, awkward pose, unbalanced, mismatched, distorted features, flat, unnatural texture, chaotic, unreadable, incoherent, asymmetrical, low quality, lowres, wrong anatomy, bad anatomy, deformed, disfigured, ugly"
-width = 64
-height = 64
+width = 512
+height = 512
 guidance_scale = 7.5
-num_inference_steps = 1
+num_inference_steps = 30
 
-base_url = "http://0.0.0.0:7860"
+base_url = "http://leonguyen101120.zapto.org:7860"
 
 
 def generate_image_url(prompt: str) -> str:
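
The new values (512x512, 30 steps) are the usual working settings for Stable Diffusion v1.x; 64x64 with a single inference step produces essentially noise, so this is a functional fix rather than a tuning tweak. A rough sketch of how these module-level parameters typically feed a diffusers pipeline call follows; the body of generate_image_url is not shown in this diff, and the pipeline attribute on image_pipeline_client is an assumption:

# Sketch only: how the parameters above would drive a StableDiffusionPipeline call.
def generate_image(prompt: str):
    image = image_pipeline_client.pipeline(
        prompt,
        negative_prompt=negative_promt,  # keeps the module's existing variable name
        width=width,                     # 512
        height=height,                   # 512
        guidance_scale=guidance_scale,   # 7.5, classifier-free guidance strength
        num_inference_steps=num_inference_steps,  # 30 denoising steps
    ).images[0]
    return image
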
src/utils/clients/image_pipeline_client.py CHANGED
@@ -1,4 +1,4 @@
-from constants.config import IMAGE_MODEL_ID_OR_LINK
+from constants.config import IMAGE_MODEL_ID_OR_LINK, TORCH_DEVICE
 from utils.timing import measure_time
 
 
@@ -20,6 +20,9 @@ def load_pipeline():
         raise ImportError(
             "diffusers is not installed. Please install it using 'pip install diffusers'."
         )
+
+    print(TORCH_DEVICE)
+
 
     with measure_time("Load image pipeline"):
         pipeline = StableDiffusionPipeline.from_pretrained(
@@ -28,7 +31,7 @@ def load_pipeline():
             variant="fp16",
             # safety_checker=True,
             use_safetensors=True,
-        )
+        ).to(TORCH_DEVICE)
 
 
 def clear_resources():
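
The substantive change here is that the loaded pipeline is now moved onto TORCH_DEVICE instead of staying on the CPU. A standalone sketch of the same loading pattern with the public diffusers API; the model id and dtype below are assumptions for illustration, while the repository passes IMAGE_MODEL_ID_OR_LINK:

# Standalone sketch: load a Stable Diffusion pipeline and move it to the selected device.
import torch
from diffusers import StableDiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # assumed model id; the repo uses IMAGE_MODEL_ID_OR_LINK
    torch_dtype=torch.float16,         # fp16 weights, matching variant="fp16" in the diff
    variant="fp16",
    use_safetensors=True,
).to(device)                           # same idea as the .to(TORCH_DEVICE) added in this commit
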
src/utils/clients/llama_cpp_client.py CHANGED
@@ -27,31 +27,17 @@ def load():
             "llama_cpp is not installed. Please install it using 'pip install llama-cpp-python'."
         )
 
+
     global _llm
 
-    # _llm = llama_cpp.Llama.from_pretrained(
-    #     repo_id=GGUF_REPO_ID,
-    #     filename=GGUF_FILE_NAME,
-    #     n_threads=os.cpu_count(),
-    #     n_gpu_layers=-1,
-    #     n_ctx=4096,
-    #     verbose=True,
-    #     use_mlock=True,
-    #     use_mmap=True,
-    #     # messages_to_prompt=messages_to_prompt,
-    #     # completion_to_prompt=completion_to_prompt,
-    # )
-
-    _llm = llama_cpp.Llama(
-        model_path=f"./.cache/{GGUF_FILE_NAME}",
+    _llm = llama_cpp.Llama.from_pretrained(
+        repo_id=GGUF_REPO_ID,
+        filename=GGUF_FILE_NAME,
         n_threads=os.cpu_count(),
         n_gpu_layers=-1,
-        n_ctx=4096,
+        n_ctx=16384,
         verbose=True,
-        use_mlock=True,
         use_mmap=True,
-        # messages_to_prompt=messages_to_prompt,
-        # completion_to_prompt=completion_to_prompt,
     )
 
 
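
Llama.from_pretrained downloads the GGUF file from the Hugging Face Hub (it requires the huggingface-hub package) instead of reading a file manually placed under ./.cache, and the context window grows from 4096 to 16384 tokens. A standalone sketch of the new loading path follows; the repo id and filename are example values, while the project's actual ones come from GGUF_REPO_ID and GGUF_FILE_NAME:

# Standalone sketch of loading a GGUF model with llama-cpp-python via the HF Hub.
import os
import llama_cpp

llm = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-3B-Instruct-GGUF",  # example repo; the project uses GGUF_REPO_ID
    filename="*q4_k_m.gguf",                  # glob patterns are accepted; project uses GGUF_FILE_NAME
    n_threads=os.cpu_count(),
    n_gpu_layers=-1,  # offload all layers to the GPU when one is available
    n_ctx=16384,      # the larger context window set in this commit
    verbose=True,
    use_mmap=True,
)

out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=64,
)
print(out["choices"][0]["message"]["content"])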