import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from huggingface_hub import InferenceClient
import re
from streaming_stt_nemo import Model
import torch
import random
from openai import OpenAI
import subprocess
from starlette.requests import ClientDisconnect
# Credentials are read from the environment; each is None when its variable is unset.
LLAMA_3B_API_KEY = os.environ.get("LLAMA_3B_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Language code of the speech-to-text engine that is created at import time.
default_lang = "en"

# Speech-to-text engines keyed by language code; only the default one is built here.
engines = {default_lang: Model(default_lang)}

# Language names accepted by translate_speech() as `target_language`, mapped to the
# codes it passes to the translation command as `--tgt_lang`.
LANGUAGE_CODES = {
    "English": "eng",
    "Spanish": "spa",
    "Chinese": "cmn",
    "French": "fra",
    "German": "deu",
    "Italian": "ita",
}
def transcribe(audio):
    """Transcribe an audio recording to text with the default-language engine.

    Args:
        audio: The recording handed to the engine's ``stt_file`` method
            (presumably a file path - confirm against the caller), or None
            when nothing was recorded.

    Returns:
        The first element of the ``stt_file`` result, or "" when ``audio``
        is None.

    Raises:
        KeyError: If ``engines`` has no entry for ``default_lang``.
    """
    if audio is None:
        return ""
    # Look the engine up through `default_lang` (the key `engines` is built with)
    # rather than a second hard-coded "en", so the two cannot drift apart.
    model = engines[default_lang]
    return model.stt_file(audio)[0]
# Return the inference client to use for the model named by `model`.
#
# `model` is matched by substring and the checks run in order, so the
# "Llama 3 8B Service" test must stay ahead of the broader "Llama" test:
#   "Llama 3 8B Service" -> OpenAI client
#   "Llama"              -> InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
#   "Mistral"            -> InferenceClient("mistralai/Mistral-7B-Instruct-v0.2")
#   "Phi"                -> InferenceClient("microsoft/Phi-3-mini-4k-instruct")
#   "Mixtral"            -> InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
# A name that matches none of these falls back to microsoft/Phi-3-mini-4k-instruct.
def llm_clients(model):
if "Llama 3 8B Service" in model:
# NOTE(review): the arguments and closing parenthesis of this OpenAI(...) call are
# missing from this copy of the file - restore them from the original before running.
return OpenAI(
elif "Llama" in model:
return InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
elif "Mistral" in model:
return InferenceClient("mistralai/Mistral-7B-Instruct-v0.2")
elif "Phi" in model:
return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
elif "Mixtral" in model:
return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
# Fallback for unrecognised model names.
return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
def randomize_seed_fn(seed: int) -> int:
    """Return a freshly drawn random seed in the inclusive range 0..999999.

    The ``seed`` argument is ignored: every call draws a new value with
    ``random.randint``.
    """
    return random.randint(0, 999999)
# System prompt used for every request: models() sends it as the "system" message on
# the "Llama 3 8B Service" path and as the prefix of the text prompt for other models.
# NOTE(review): the closing triple quote of this literal is not present in this copy
# of the file, so the string is unterminated here - confirm where the prompt ends
# before running.
# `conversation_history`, assigned right after the prompt, is the module-level list of
# {"role": ..., "content": ...} dicts that models() appends to and clear_history() resets.
system_prompt = """
[SYSTEM] You are OPTIMUS Prime, a personal AI voice assistant created by Jaward. Keep conversations friendly, concise, and to the point. Provide clear and direct answers, avoiding unnecessary introductions. Maintain a normal, conversational tone while being both helpful and approachable. Use context from previous interactions to enhance your responses.
Your creator, Jaward, is an AI Research Engineer at Linksoul AI, specializing in advanced AI systems, particularly in training and optimization. He aims to develop AI that not only mimics human intelligence but also enhances it. Jaward has significantly contributed to the open-source community with fundamental implementations of AI/ML research papers. He completed his first internship at the Beijing Academy of Artificial Intelligence, where he contributed to cutting-edge research. His work led to the publication of an insightful paper, "AUTOAGENTS - A Framework for Automatic Agent Generation," accepted at IJCAI this year. Currently, Jaward is interning at LinkSoul AI, a small open-source AI research startup in Beijing.
conversation_history = []
# Generate the assistant's reply to `text` with the selected model and record the
# exchange in the module-level `conversation_history`.
#
# Parameters:
#   text  - the user's message.
#   model - model name; selects the client through llm_clients() and, when it contains
#           "Llama 3 8B Service", the chat-message path instead of the text-prompt path.
#   seed  - overwritten immediately with a random seed from randomize_seed_fn().
# Returns the assistant's reply as a string.
#
# NOTE(review): this copy of the file has lost its indentation and several lines (the
# closing "]" of `messages`, the right-hand side of `completion =`, and the arguments
# of `generate_kwargs = dict(`). Branch boundaries therefore cannot be read off the
# layout - restore the missing lines from the original before running.
def models(text, model="Llama 3 8B Service", seed=42):
global conversation_history
seed = int(randomize_seed_fn(seed))
# NOTE(review): `generator` is not referenced again in the visible code.
generator = torch.Generator().manual_seed(seed)
client = llm_clients(model)
if "Llama 3 8B Service" in model:
# Chat-style request: system prompt first, then the prior turns, then the new message.
messages = [
{"role": "system", "content": system_prompt},
] + conversation_history + [
{"role": "user", "content": text}
completion =
assistant_response = completion.choices[0].message.content
# Update conversation history
conversation_history.append({"role": "user", "content": text})
conversation_history.append({"role": "assistant", "content": assistant_response})
# Keep only the last 20 entries (10 user/assistant exchanges) to avoid token limit issues
if len(conversation_history) > 20:
conversation_history = conversation_history[-20:]
return assistant_response
# For other models, we'll concatenate the conversation history into a single string
history_text = "\n".join([f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}" for msg in conversation_history])
formatted_prompt = f"{system_prompt}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:"
generate_kwargs = dict(
stream = client.text_generation(
formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
# Accumulate the streamed tokens into the reply, skipping tokens whose text is empty.
output = ""
for response in stream:
if not response.token.text == "":
output += response.token.text
# Update conversation history
conversation_history.append({"role": "user", "content": text})
conversation_history.append({"role": "assistant", "content": output})
# Keep only the last 20 entries (10 user/assistant exchanges) to avoid token limit issues
if len(conversation_history) > 20:
conversation_history = conversation_history[-20:]
return output
# Translate the speech in `audio_file` into `target_language` and return the path of
# the generated audio file.
#
# Parameters:
#   audio_file      - input audio; None short-circuits to a None result.
#   target_language - a key of LANGUAGE_CODES; an unknown name raises KeyError.
# Returns "translated_audio.wav" when that file exists after the command has run,
# otherwise None.
#
# NOTE(review): lines are missing around the command construction in this copy of the
# file - the list opened by `command = [` has no program name or input argument, and
# `], check=True)` closes a call whose opening is not shown. Restore them from the
# original before running.
def translate_speech(audio_file, target_language):
if audio_file is None:
return None
language_code = LANGUAGE_CODES[target_language]
# NOTE(review): the output path is a fixed name, so every call writes the same file.
output_file = "translated_audio.wav"
command = [
"--tgt_lang", language_code,
"--model_name", "seamless_expressivity",
"--vocoder_name", "vocoder_pretssel",
"--gated-model-dir", "models",
"--output_path", output_file
], check=True)
# Report whether the command actually produced the output file.
if os.path.exists(output_file):
print(f"File created successfully: {output_file}")
return output_file
print(f"File not found: {output_file}")
return None
# Handle one voice request: transcribe `audio`, then either translate it or answer it
# with the selected model and turn the answer into speech.
#
# Parameters:
#   audio           - the recorded input; None yields the "No input detected." status.
#   model, seed     - forwarded unchanged to models().
#   target_language - forwarded to translate_speech() on the translation path.
# Returns a 3-tuple (assistant_audio_path, translated_audio_path, status_message);
# the audio slot that does not apply is None, and both are None on failure.
#
# NOTE(review): this copy of the file is missing the `try:` that the `except` clauses
# below belong to, the right-hand side of `tmp_path =`, and whatever writes the
# synthesized speech to that path. Restore them from the original before running.
async def respond(audio, model, seed, target_language):
if audio is None:
return None, None, "No input detected."
user_input = transcribe(audio)
if not user_input:
return None, None, "Could not transcribe audio."
# A transcript that starts with "please translate" (case-insensitive) selects translation.
if user_input.lower().startswith("please translate"):
# Extract the actual content to translate
# NOTE(review): `content_to_translate` is not used in the visible code; the original
# recording, not this text, is what gets passed to translate_speech().
content_to_translate = user_input[len("please translate"):].strip()
translated_audio = translate_speech(audio, target_language)
return None, translated_audio, f"Translated to {target_language}"
reply = models(user_input, model, seed)
communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
# delete=False keeps the temporary file on disk after the `with` block closes it.
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
tmp_path =
return tmp_path, None, "Voice assistant response"
except ClientDisconnect:
print("Client disconnected")
return None, None, "Client disconnected. Please try again."
# Any other failure is reported to the caller as a status message rather than raised.
except Exception as e:
print(f"An error occurred: {str(e)}")
return None, None, f"An error occurred: {str(e)}"
def clear_history():
    """Reset the shared conversation history to an empty list.

    The module-level ``conversation_history`` name is rebound to a new list
    rather than emptied in place.

    Returns:
        The tuple ``(None, None, "Conversation history cleared.")``.
    """
    global conversation_history
    conversation_history = list()
    status = "Conversation history cleared."
    return (None, None, status)
with gr.Blocks(css="style.css") as demo: