Upload app.py
app.py CHANGED
@@ -1,28 +1,39 @@
 from flask import Flask, render_template, request, jsonify
 from llama_cpp import Llama
-import
+import os
 
 app = Flask(__name__)
 
-#
-MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"
+# Update this path to your downloaded model weight
+MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"
 
-#
-
-
-
-
-
+# Detect GPU automatically: if llama-cpp-python was compiled with CUDA/Metal and GPU layers can be offloaded
+# Adjust n_gpu_layers for your GPU memory; 20-40 for mid GPUs, 60-100 for higher VRAM, 0 = CPU only
+try:
+    print("Trying GPU offload...")
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=2048,
+        n_threads=os.cpu_count(),
+        n_gpu_layers=40  # increase or decrease based on your GPU memory
+    )
+    print("GPU initialized successfully.")
+except Exception as e:
+    print(f"GPU failed: {e}\nFalling back to CPU.")
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=2048,
+        n_threads=os.cpu_count(),
+        n_gpu_layers=0  # CPU only
+    )
 
-# Build adaptive prompt
 def build_prompt(history, user_text):
     system_prompt = (
-        "You are a helpful
-        "-
-        "-
-        "-
-        "-
-        "- Output code inside proper Markdown code blocks with language tags for syntax highlighting.\n"
+        "You are a helpful assistant. Follow these:\n"
+        "- Simple Q: Short, precise.\n"
+        "- Story/letter/essay: Longer answer.\n"
+        "- Code: Complete, neat, Markdown fenced code with language tag.\n"
+        "- Use points when helpful.\n"
     )
     prompt = system_prompt + "\n\n"
     for turn in history:
@@ -42,7 +53,6 @@ def chat():
 
     prompt = build_prompt(history, user_message)
 
-    # Adjust max_tokens dynamically
     if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
         max_out = 800
     elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
@@ -57,10 +67,7 @@ def chat():
         stop=["\nUser:", "\nAssistant:"]
     )
 
-
-
-    # Wrap fenced code blocks with copy button (handled in JS)
-    return jsonify({"response": text})
+    return jsonify({"response": resp["choices"][0]["text"].strip()})
 
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=5000, debug=True)
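The commit hard-codes n_gpu_layers=40 and falls back to a CPU-only Llama instance in the except branch. As a minimal sketch (not part of this commit), the layer count could instead be read from an environment variable so the same app.py runs on machines with different VRAM without edits; the GPU_LAYERS variable and load_model helper below are assumptions for illustration and reuse only the Llama constructor arguments shown above.

import os
from llama_cpp import Llama

MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"

# Hypothetical env var; defaults to the 40 layers used in this commit, 0 forces CPU.
GPU_LAYERS = int(os.environ.get("GPU_LAYERS", "40"))

def load_model(n_gpu_layers):
    # Same constructor arguments as in app.py above; only the layer count varies.
    return Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=os.cpu_count(),
        n_gpu_layers=n_gpu_layers,
    )

try:
    llm = load_model(GPU_LAYERS)  # try GPU offload first
except Exception as e:
    print(f"GPU offload failed: {e}\nFalling back to CPU.")
    llm = load_model(0)  # CPU only

With such a change, GPU_LAYERS=0 python app.py would skip offload entirely, while a larger value can be set on cards with more VRAM, matching the guidance in the commit's comment.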