import gradio as gr
import requests
import json
import os
import threading
import queue
import time

# Load all configuration from environment variables
TOGETHER_API_KEY = os.environ.get('TOGETHER_API_KEY', '')
TOGETHER_API_URL = os.environ.get('TOGETHER_API_URL', 'https://api.together.xyz/v1/chat/completions')
MODEL_A_NAME = os.environ.get('MODEL_A_NAME', '')
MODEL_B_NAME = os.environ.get('MODEL_B_NAME', '')
MODEL_C_NAME = os.environ.get('MODEL_C_NAME', '')

# Display names for the UI
MODEL_A_DISPLAY = os.environ.get('MODEL_A_DISPLAY', '')
MODEL_B_DISPLAY = os.environ.get('MODEL_B_DISPLAY', '')
MODEL_C_DISPLAY = os.environ.get('MODEL_C_DISPLAY', '')

# Headers for API calls
HEADERS = {
    "Authorization": f"Bearer {TOGETHER_API_KEY}",
    "Content-Type": "application/json"
}

SYSTEM_PROMPT = os.environ.get('SYSTEM_PROMPT', """You are an expert conversationalist who responds to the best of your ability. The assistant is Palmyra, created by Writer.""")

MODELS = {
    "Model A": MODEL_A_NAME,
    "Model B": MODEL_B_NAME,
    "Model C": MODEL_C_NAME
}
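
# Example configuration (illustrative values only; the real settings are
# supplied as environment variables or Space secrets, and the model ids
# below are placeholders, not the Space's actual choices):
#   export TOGETHER_API_KEY="..."
#   export MODEL_A_NAME="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
#   export MODEL_A_DISPLAY="Llama 3.1 70B"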

def stream_together_model(model_name, user_prompt, add_thinking_delay=False):
    # Optionally show a transient placeholder, then clear it before real tokens arrive.
    if add_thinking_delay:
        yield "🤔 Thinking..."
        time.sleep(1)
        yield ""
    body = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ],
        "stream": True
    }
    try:
        with requests.post(TOGETHER_API_URL, headers=HEADERS, json=body, stream=True, timeout=120) as response:
            response.raise_for_status()
            for line in response.iter_lines():
                if not line:
                    continue
                payload = line.decode("utf-8").removeprefix("data: ")
                if payload.strip() == "[DONE]":  # SSE end-of-stream sentinel
                    break
                try:
                    data = json.loads(payload)
                except json.JSONDecodeError:
                    continue
                content = data.get("choices", [{}])[0].get("delta", {}).get("content", "")
                if content:
                    yield content
    except Exception as e:
        yield f"[Error: {e}]"

def stream_model_c(user_prompt):
    # Alternative path for Model C against a self-hosted, OpenAI-compatible
    # endpoint. This helper is not wired into the UI below (the third chatbot
    # streams through the Together API like the others). It is also
    # non-streaming: the full completion is yielded in a single chunk.
    url = "http://192.222.54.94:8000/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    body = {
        "model": "palmyra-x5-v2",
        "messages": [
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.07
    }
    try:
        response = requests.post(url, headers=headers, json=body, timeout=120)
        response.raise_for_status()
        data = response.json()
        content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        yield content
    except Exception as e:
        yield f"[Error: {e}]"

custom_css = """... (unchanged CSS, keep same) ..."""

with gr.Blocks(css=custom_css, theme=gr.themes.Base()) as demo:
    gr.HTML("""
    <div class="container">
        <h1>Palmyra-x5</h1>
    </div>
    """)
    with gr.Row():
        chatbot_a = gr.Chatbot(label=MODEL_A_DISPLAY, height=500, bubble_full_width=False)
        chatbot_b = gr.Chatbot(label=MODEL_B_DISPLAY, height=500, bubble_full_width=False)
        chatbot_c = gr.Chatbot(label=MODEL_C_DISPLAY, height=500, bubble_full_width=False)
    with gr.Row():
        user_input = gr.Textbox(placeholder="Type your message...", show_label=False, scale=8)
        thinking_toggle = gr.Checkbox(label="Show Thinking Process", value=True, scale=2)
        submit_btn = gr.Button("Send", scale=1, variant="primary")
    gr.Examples(
        examples=[
            "What does Tencent do?",
            "Explain quantum computing",
            "Write a haiku about AI",
            "Compare Python vs JavaScript",
            "Tips for better sleep"
        ],
        inputs=user_input,
        label="Try these examples:"
    )

    def stream_all_models(message, enable_thinking, hist_a, hist_b, hist_c):
        if not message.strip():
            # This function is a generator, so bail out with a yield plus a bare
            # return: a `return value` inside a generator never reaches Gradio.
            yield hist_a, hist_b, hist_c, ""
            return
        hist_a = hist_a + [[message, ""]]
        hist_b = hist_b + [[message, ""]]
        hist_c = hist_c + [[message, ""]]
        yield hist_a, hist_b, hist_c, ""
        q1, q2, q3 = queue.Queue(), queue.Queue(), queue.Queue()

        def fetch_stream(q, model, add_delay=False):
            try:
                for chunk in stream_together_model(model, message, add_delay):
                    q.put(chunk)
            finally:
                q.put(None)  # sentinel: this model's stream has finished

        def fetch_stream_c(q, message):
            # Unused alternative that would route Model C through the
            # self-hosted endpoint via stream_model_c; kept for that deployment.
            try:
                for chunk in stream_model_c(message):
                    q.put(chunk)
            finally:
                q.put(None)

        # One producer thread per model. daemon=True keeps a hung request from
        # blocking interpreter shutdown; the "Show Thinking Process" toggle
        # controls whether the placeholder is shown.
        threading.Thread(target=fetch_stream, args=(q1, MODELS["Model A"], enable_thinking), daemon=True).start()
        threading.Thread(target=fetch_stream, args=(q2, MODELS["Model B"], enable_thinking), daemon=True).start()
        threading.Thread(target=fetch_stream, args=(q3, MODELS["Model C"], enable_thinking), daemon=True).start()

        def drain(q, hist):
            # Pull at most one chunk from a queue; returns (done, updated).
            try:
                chunk = q.get(timeout=0.05)
            except queue.Empty:
                return False, False
            if chunk is None:
                return True, False
            if chunk == "":
                hist[-1][1] = ""      # clear the thinking placeholder
            elif chunk.startswith("🤔"):
                hist[-1][1] = chunk   # show the placeholder verbatim
            else:
                hist[-1][1] += chunk  # append streamed tokens
            return False, True

        done_a = done_b = done_c = False
        while not (done_a and done_b and done_c):
            updated = False
            if not done_a:
                done_a, u = drain(q1, hist_a)
                updated = updated or u
            if not done_b:
                done_b, u = drain(q2, hist_b)
                updated = updated or u
            if not done_c:
                done_c, u = drain(q3, hist_c)
                updated = updated or u
            if updated:
                yield hist_a, hist_b, hist_c, ""

    submit_btn.click(
        stream_all_models,
        [user_input, thinking_toggle, chatbot_a, chatbot_b, chatbot_c],
        [chatbot_a, chatbot_b, chatbot_c, user_input]
    )
    user_input.submit(
        stream_all_models,
        [user_input, thinking_toggle, chatbot_a, chatbot_b, chatbot_c],
        [chatbot_a, chatbot_b, chatbot_c, user_input]
    )

if __name__ == "__main__":
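    # Sketch of an optional step: on Gradio 3.x, generator-based handlers need
    # the request queue enabled for incremental updates, so uncomment the next
    # line if partial output never renders (on 4.x the queue is on by default).
    # demo.queue()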
    demo.launch()