Update app.py
app.py CHANGED
@@ -2,17 +2,14 @@
 #huggingface-llama-recipes : https://github.com/huggingface/huggingface-llama-recipes/tree/main
 
 import gradio as gr
-from openai import OpenAI
+from huggingface_hub import InferenceClient
 import os
 
 ACCESS_TOKEN = os.getenv("myHFtoken")
 
 print("Access token loaded.")
 
-client = OpenAI(
-    base_url="https://api-inference.huggingface.co/v1/",
-    api_key=ACCESS_TOKEN,
-)
+client = InferenceClient(api_key=ACCESS_TOKEN)
 
 print("Client initialized.")
 
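This hunk swaps the OpenAI-compatible client pointed at api-inference.huggingface.co for huggingface_hub's InferenceClient, which routes requests by model name. A minimal standalone sketch of the new setup, assuming huggingface_hub >= 0.24 (which provides the OpenAI-style client.chat.completions.create facade used later in the diff); the model name and prompt are placeholders, and myHFtoken is the Space secret referenced above:

import os
from huggingface_hub import InferenceClient

token = os.getenv("myHFtoken")
if token is None:
    # Fail fast if the Space secret is missing instead of sending anonymous requests.
    raise RuntimeError("Set the myHFtoken secret to a Hugging Face access token.")

client = InferenceClient(api_key=token)

# Non-streaming smoke test against one of the models listed further down.
out = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Say hello in five words."}],
    max_tokens=32,
)
print(out.choices[0].message.content)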
@@ -43,8 +40,8 @@ def respond(
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Selected model: {model_name}")
 
+    # Prepare messages for the Hugging Face API
     messages = [{"role": "system", "content": system_message}]
-
     for val in history:
         if val[0]:
             messages.append({"role": "user", "content": val[0]})
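For reference, the loop above converts Gradio's tuple-style chat history into OpenAI-style role/content dicts. A self-contained sketch with illustrative values (the history entries are placeholders; either side of a tuple may be empty, hence the truthiness checks):

system_message = "You are a helpful assistant."
history = [("Hello", "Hi! How can I help?"), ("What is 2 + 2?", "4")]

messages = [{"role": "system", "content": system_message}]
for user_msg, assistant_msg in history:
    if user_msg:
        messages.append({"role": "user", "content": user_msg})
    if assistant_msg:
        messages.append({"role": "assistant", "content": assistant_msg})
messages.append({"role": "user", "content": "And times ten?"})
print(messages)  # system, then alternating user/assistant turns, then the new prompt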
@@ -54,19 +51,21 @@ def respond(
             print(f"Added assistant message to context: {val[1]}")
 
     messages.append({"role": "user", "content": message})
-
     response = ""
-    print("Sending request to OpenAI API.")
-
-    for message in client.chat.completions.create(
+    print("Sending request to Hugging Face API.")
+
+    # Stream response from Hugging Face API
+    completion = client.chat.completions.create(
         model=model_name,
+        messages=messages,
         max_tokens=max_tokens,
-        stream=True,
         temperature=temperature,
         top_p=top_p,
-        messages=messages,
-    ):
-        token = message.choices[0].delta.content
+        stream=True,
+    )
+
+    for message in completion:
+        token = message.delta.get("content", "")
         print(f"Received token: {token}")
         response += token
         yield response
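One caveat with the added streaming loop: with huggingface_hub, each streamed chunk is a ChatCompletionStreamOutput whose text lives at choices[0].delta.content (a dataclass attribute, not a dict), so message.delta.get("content", "") would likely raise AttributeError at runtime; the loop variable also shadows respond()'s message argument. A corrected sketch under those assumptions (stream_reply is a hypothetical helper mirroring the loop, not part of the app):

def stream_reply(client, model_name, messages, max_tokens, temperature, top_p):
    # Hypothetical helper mirroring the streaming section of respond().
    completion = client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )
    response = ""
    for chunk in completion:  # 'chunk' avoids shadowing the message argument
        token = chunk.choices[0].delta.content or ""  # delta.content can be None
        response += token
        yield response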
@@ -74,16 +73,16 @@ def respond(
     print("Completed response generation.")
 
 models = [
+    "meta-llama/Llama-3.2-3B-Instruct",
     "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
     "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
     "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
-    "ngxson/MiniThinky-v2-1B-Llama-3.2",
-    "meta-llama/Llama-3.2-3B-Instruct",
     "PowerInfer/SmallThinker-3B-Preview",
     "NovaSky-AI/Sky-T1-32B-Preview",
     "Qwen/QwQ-32B-Preview",
     "Qwen/Qwen2.5-Coder-32B-Instruct",
     "microsoft/Phi-3-mini-128k-instruct",
+    "microsoft/phi-4"
 ]
 
 with gr.Blocks() as demo:
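The Blocks body itself is outside this diff. For context, a minimal sketch of how a respond() generator with these extra controls is typically wired up in Gradio; the argument order, labels, and default values here are assumptions, not taken from the app:

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(1, 4096, value=512, step=1, label="Max new tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Dropdown(choices=models, value=models[0], label="Model"),
    ],
)

demo.launch()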