|
import gradio as gr |
|
import edge_tts |
|
import asyncio |
|
import tempfile |
|
import os |
|
from huggingface_hub import InferenceClient |
|
import re |
|
from streaming_stt_nemo import Model |
|
import torch |
|
import random |
|
from openai import OpenAI |
|
import subprocess |
|
from starlette.requests import ClientDisconnect |
|
|
|
LLAMA_3B_API_ENDPOINT = os.environ.get("LLAMA_3B_API_ENDPOINT") |
|
LLAMA_3B_API_KEY = os.environ.get("LLAMA_3B_API_KEY") |
|
HF_TOKEN = os.environ.get("HF_TOKEN", None) |
|
|
|
default_lang = "en" |
|
engines = { default_lang: Model(default_lang) } |
|
|
|
LANGUAGE_CODES = { |
|
"English": "eng", |
|
"Spanish": "spa", |
|
"Chinese": "cmn", |
|
"French": "fra", |
|
"German": "deu", |
|
"Italian": "ita" |
|
} |
|
|
|
def transcribe(audio): |
|
if audio is None: |
|
return "" |
|
lang = "en" |
|
model = engines[lang] |
|
text = model.stt_file(audio)[0] |
|
return text |
|
|
|
def llm_clients(model): |
|
if "Llama 3 8B Service" in model: |
|
return OpenAI( |
|
base_url=LLAMA_3B_API_ENDPOINT, |
|
api_key=LLAMA_3B_API_KEY |
|
) |
|
elif "Llama" in model: |
|
return InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") |
|
elif "Mistral" in model: |
|
return InferenceClient("mistralai/Mistral-7B-Instruct-v0.2") |
|
elif "Phi" in model: |
|
return InferenceClient("microsoft/Phi-3-mini-4k-instruct") |
|
elif "Mixtral" in model: |
|
return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1") |
|
else: |
|
return InferenceClient("microsoft/Phi-3-mini-4k-instruct") |
|
|
|
def randomize_seed_fn(seed: int) -> int: |
|
seed = random.randint(0, 999999) |
|
return seed |
|
|
|
system_prompt = """ |
|
[SYSTEM] You are OPTIMUS Prime, a personal AI voice assistant created by Jaward. Keep conversations friendly, concise, and to the point. Provide clear and direct answers, avoiding unnecessary introductions. Maintain a normal, conversational tone while being both helpful and approachable. Use context from previous interactions to enhance your responses. |
|
|
|
Your creator, Jaward, is an AI Research Engineer at Linksoul AI, specializing in advanced AI systems, particularly in training and optimization. He aims to develop AI that not only mimics human intelligence but also enhances it. Jaward has significantly contributed to the open-source community with fundamental implementations of AI/ML research papers. He completed his first internship at the Beijing Academy of Artificial Intelligence, where he contributed to cutting-edge research. His work led to the publication of an insightful paper, "AUTOAGENTS - A Framework for Automatic Agent Generation," accepted at IJCAI this year. Currently, Jaward is interning at LinkSoul AI, a small open-source AI research startup in Beijing. |
|
[USER] |
|
""" |
|
|
|
conversation_history = [] |
|
|
|
def models(text, model="Llama 3 8B Service", seed=42): |
|
global conversation_history |
|
seed = int(randomize_seed_fn(seed)) |
|
generator = torch.Generator().manual_seed(seed) |
|
|
|
client = llm_clients(model) |
|
|
|
if "Llama 3 8B Service" in model: |
|
messages = [ |
|
{"role": "system", "content": system_prompt}, |
|
] + conversation_history + [ |
|
{"role": "user", "content": text} |
|
] |
|
completion = client.chat.completions.create( |
|
model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/", |
|
messages=messages |
|
) |
|
assistant_response = completion.choices[0].message.content |
|
|
|
|
|
conversation_history.append({"role": "user", "content": text}) |
|
conversation_history.append({"role": "assistant", "content": assistant_response}) |
|
|
|
|
|
if len(conversation_history) > 20: |
|
conversation_history = conversation_history[-20:] |
|
|
|
return assistant_response |
|
else: |
|
|
|
history_text = "\n".join([f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}" for msg in conversation_history]) |
|
formatted_prompt = f"{system_prompt}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:" |
|
|
|
generate_kwargs = dict( |
|
max_new_tokens=300, |
|
seed=seed |
|
) |
|
stream = client.text_generation( |
|
formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False) |
|
output = "" |
|
for response in stream: |
|
if not response.token.text == "</s>": |
|
output += response.token.text |
|
|
|
|
|
conversation_history.append({"role": "user", "content": text}) |
|
conversation_history.append({"role": "assistant", "content": output}) |
|
|
|
|
|
if len(conversation_history) > 20: |
|
conversation_history = conversation_history[-20:] |
|
|
|
return output |
|
|
|
def translate_speech(audio_file, target_language): |
|
if audio_file is None: |
|
return None |
|
|
|
language_code = LANGUAGE_CODES[target_language] |
|
output_file = "translated_audio.wav" |
|
|
|
command = [ |
|
"expressivity_predict", |
|
audio_file, |
|
"--tgt_lang", language_code, |
|
"--model_name", "seamless_expressivity", |
|
"--vocoder_name", "vocoder_pretssel", |
|
"--gated-model-dir", "models", |
|
"--output_path", output_file |
|
] |
|
|
|
subprocess.run(command, check=True) |
|
|
|
if os.path.exists(output_file): |
|
print(f"File created successfully: {output_file}") |
|
return output_file |
|
else: |
|
print(f"File not found: {output_file}") |
|
return None |
|
|
|
async def respond(audio, model, seed, target_language): |
|
try: |
|
if audio is None: |
|
return None, None, "No input detected." |
|
|
|
user_input = transcribe(audio) |
|
if not user_input: |
|
return None, None, "Could not transcribe audio." |
|
|
|
if user_input.lower().startswith("please translate"): |
|
|
|
content_to_translate = user_input[len("please translate"):].strip() |
|
translated_audio = translate_speech(audio, target_language) |
|
return None, translated_audio, f"Translated to {target_language}" |
|
else: |
|
reply = models(user_input, model, seed) |
|
communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural") |
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: |
|
tmp_path = tmp_file.name |
|
await communicate.save(tmp_path) |
|
return tmp_path, None, "Voice assistant response" |
|
except ClientDisconnect: |
|
print("Client disconnected") |
|
return None, None, "Client disconnected. Please try again." |
|
except Exception as e: |
|
print(f"An error occurred: {str(e)}") |
|
return None, None, f"An error occurred: {str(e)}" |
|
|
|
def clear_history(): |
|
global conversation_history |
|
conversation_history = [] |
|
return None, None, "Conversation history cleared." |
|
|
|
with gr.Blocks(css="style.css") as demo: |
|
gr.Markdown("# <br><center><b>Optimus Prime: Your Personal AI Voice Assistant with Speech Translation</b></center>") |
|
gr.Markdown("## <center><b>For speech translation, start with the phrase 'Please translate' followed by the speech you want to translate</b></center><br>") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
input_audio = gr.Audio(label="Click record and start speaking", sources=["microphone"], type="filepath") |
|
select = gr.Dropdown([ |
|
'Llama 3 8B Service', |
|
'Mixtral 8x7B', |
|
'Llama 3 8B', |
|
'Mistral 7B v0.3', |
|
'Phi 3 mini', |
|
], |
|
value="Llama 3 8B Service", |
|
label="Model" |
|
) |
|
seed = gr.Slider( |
|
label="Seed", |
|
minimum=0, |
|
maximum=999999, |
|
step=1, |
|
value=0, |
|
visible=False |
|
) |
|
target_lang = gr.Dropdown( |
|
choices=list(LANGUAGE_CODES.keys()), |
|
value="German", |
|
label="Target Language for Translation" |
|
) |
|
clear_button = gr.Button("Clear Conversation History") |
|
|
|
with gr.Column(scale=1): |
|
output_audio = gr.Audio(label="AI Voice Assistant's Response", type="filepath", interactive=False, autoplay=True) |
|
translated_audio = gr.Audio(label="Translated Speech", type="filepath", interactive=False, autoplay=True) |
|
status_message = gr.Textbox(label="Status", interactive=False) |
|
|
|
input_audio.change( |
|
fn=respond, |
|
inputs=[input_audio, select, seed, target_lang], |
|
outputs=[output_audio, translated_audio, status_message], |
|
) |
|
|
|
clear_button.click(fn=clear_history, inputs=[], outputs=[output_audio, translated_audio, status_message]) |
|
|
|
if __name__ == "__main__": |
|
demo.queue(max_size=200).launch() |