remiai3 committed
Commit d2edc36 · verified · 1 Parent(s): c60b9e0

Upload app.py

Files changed (1)
  1. app.py +28 -21
app.py CHANGED
@@ -1,28 +1,39 @@
 from flask import Flask, render_template, request, jsonify
 from llama_cpp import Llama
-import re
+import os
 
 app = Flask(__name__)
 
-# Path to the local GGUF model weights
-MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"  # update this path
+# Update this path to your downloaded model weight
+MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"
 
-# Initialize model
-llm = Llama(
-    model_path=MODEL_PATH,
-    n_ctx=2048,
-    n_threads=8  # adjust based on your CPU
-)
+# Detect GPU automatically: if llama-cpp-python was compiled with CUDA/Metal and GPU layers can be offloaded
+# Adjust n_gpu_layers for your GPU memory; 20-40 for mid GPUs, 60-100 for higher VRAM, 0 = CPU only
+try:
+    print("Trying GPU offload...")
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=2048,
+        n_threads=os.cpu_count(),
+        n_gpu_layers=40  # increase or decrease based on your GPU memory
+    )
+    print("GPU initialized successfully.")
+except Exception as e:
+    print(f"GPU failed: {e}\nFalling back to CPU.")
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=2048,
+        n_threads=os.cpu_count(),
+        n_gpu_layers=0  # CPU only
+    )
 
-# Build adaptive prompt
 def build_prompt(history, user_text):
     system_prompt = (
-        "You are a helpful and adaptive assistant. Follow these rules strictly:\n"
-        "- If the user asks a simple or factual question, give a short, precise answer.\n"
-        "- If the user requests a story, essay, or letter, provide a longer, well-structured response.\n"
-        "- If the user asks for programming help or code, provide correct, complete, well-formatted code.\n"
-        "- Always keep answers clear, neat, and structured; use points when helpful.\n"
-        "- Output code inside proper Markdown code blocks with language tags for syntax highlighting.\n"
+        "You are a helpful assistant. Follow these:\n"
+        "- Simple Q: Short, precise.\n"
+        "- Story/letter/essay: Longer answer.\n"
+        "- Code: Complete, neat, Markdown fenced code with language tag.\n"
+        "- Use points when helpful.\n"
     )
     prompt = system_prompt + "\n\n"
     for turn in history:
@@ -42,7 +53,6 @@ def chat():
 
     prompt = build_prompt(history, user_message)
 
-    # Adjust max_tokens dynamically
     if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
         max_out = 800
     elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
@@ -57,10 +67,7 @@ def chat():
         stop=["\nUser:", "\nAssistant:"]
     )
 
-    text = resp["choices"][0]["text"].strip()
-
-    # Wrap fenced code blocks with copy button (handled in JS)
-    return jsonify({"response": text})
+    return jsonify({"response": resp["choices"][0]["text"].strip()})
 
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=5000, debug=True)
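
A note on the new try/except fallback: a CPU-only build of llama-cpp-python generally does not raise when n_gpu_layers=40 is requested; llama.cpp logs a warning and loads the model on the CPU, so the except branch mostly catches real load failures such as GPU out-of-memory. A sketch of an explicit capability check instead, assuming the installed llama-cpp-python version exposes llama_supports_gpu_offload() (recent releases bind the C function of that name; treat its availability as an assumption):

    # Sketch: pick n_gpu_layers from build capability instead of trial-and-error.
    # Assumes llama_cpp.llama_supports_gpu_offload() exists in the installed version.
    import llama_cpp

    offload_ok = llama_cpp.llama_supports_gpu_offload()
    n_gpu_layers = 40 if offload_ok else 0  # 0 = CPU only
    print(f"GPU offload supported: {offload_ok}; using n_gpu_layers={n_gpu_layers}")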
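
To smoke-test the server once it is up, a minimal sketch, assuming the handler shown in the hunk headers is registered as POST /chat and reads "message" and "history" from the JSON body (the route decorator and request parsing are outside this diff, so all three names are assumptions):

    # Minimal request against the running app (app.run binds 0.0.0.0:5000).
    # "/chat", "message", and "history" are assumed names; adjust to the actual route.
    import requests

    resp = requests.post(
        "http://localhost:5000/chat",
        json={"message": "Write a short story about a robot.", "history": []},
        timeout=300,  # CPU-only generation on a 20B model can be slow
    )
    print(resp.json()["response"])

Because the message contains "story", this request should take the max_out = 800 branch of the dynamic token limit.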