Spaces:

Jaward
/

optimus

Sleeping

App Files Files Community

optimus / app.py

Jaward

Update app.py

04f7032 verified 3 months ago

raw

history blame contribute delete

9.1 kB

	import gradio as gr
	import edge_tts
	import asyncio
	import tempfile
	import os
	from huggingface_hub import InferenceClient
	import re
	from streaming_stt_nemo import Model
	import torch
	import random
	from openai import OpenAI
	import subprocess
	from starlette.requests import ClientDisconnect

	# Endpoint and API key of the Llama 3 service, plus the Hugging Face token,
	# all read from the environment (None when a variable is unset).
	LLAMA_3B_API_ENDPOINT = os.getenv("LLAMA_3B_API_ENDPOINT")
	LLAMA_3B_API_KEY = os.getenv("LLAMA_3B_API_KEY")
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Speech-to-text engines keyed by language code; only the default language
	# is instantiated at import time.
	default_lang = "en"
	engines = {default_lang: Model(default_lang)}

	# UI display name -> language code passed to the translation command
	# as its --tgt_lang value.
	LANGUAGE_CODES = dict(
	    English="eng",
	    Spanish="spa",
	    Chinese="cmn",
	    French="fra",
	    German="deu",
	    Italian="ita",
	)

	def transcribe(audio):
	    """Transcribe a recorded audio file with the English speech-to-text engine.

	    Args:
	        audio: The recording to transcribe (the UI supplies a file path), or
	            ``None`` when nothing was recorded.

	    Returns:
	        The first element of the engine's ``stt_file`` result, or an empty
	        string when ``audio`` is ``None``.
	    """
	    if audio is None:
	        return ""
	    english_engine = engines["en"]
	    return english_engine.stt_file(audio)[0]

	def llm_clients(model):
	    """Build the client object for the model label chosen in the UI.

	    The label is matched by substring, in a fixed order, so the first
	    matching rule wins.

	    Args:
	        model: Model label, e.g. ``"Llama 3 8B Service"`` or ``"Mixtral 8x7B"``.

	    Returns:
	        An ``OpenAI`` client pointed at the configured Llama service for the
	        "Llama 3 8B Service" label, otherwise an ``InferenceClient`` for the
	        matching Hub repository (Phi-3 mini when nothing matches).
	    """
	    if "Llama 3 8B Service" in model:
	        return OpenAI(base_url=LLAMA_3B_API_ENDPOINT, api_key=LLAMA_3B_API_KEY)

	    # Ordered (label fragment, Hub repo) rules; order matters for labels that
	    # could contain more than one fragment.
	    # NOTE(review): the UI offers "Mistral 7B v0.3" but this resolves to the
	    # v0.2 repository -- confirm which version is intended.
	    hub_routes = (
	        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
	        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.2"),
	        ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
	        ("Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1"),
	    )
	    for fragment, repo_id in hub_routes:
	        if fragment in model:
	            return InferenceClient(repo_id)

	    # Unknown label: fall back to Phi-3 mini.
	    return InferenceClient("microsoft/Phi-3-mini-4k-instruct")

	def randomize_seed_fn(seed: int) -> int:
	    """Return a new random seed in ``[0, 999999]``.

	    The incoming ``seed`` is accepted for interface compatibility but is
	    not used; a fresh value is always drawn.
	    """
	    return random.randint(0, 999999)

	# System prompt sent with every LLM request: it defines the assistant's
	# persona and tone and supplies background about its creator.
	system_prompt = """
	[SYSTEM] You are OPTIMUS Prime, a personal AI voice assistant created by Jaward. Keep conversations friendly, concise, and to the point. Provide clear and direct answers, avoiding unnecessary introductions. Maintain a normal, conversational tone while being both helpful and approachable. Use context from previous interactions to enhance your responses.

	Your creator, Jaward, is an AI Research Engineer at Linksoul AI, specializing in advanced AI systems, particularly in training and optimization. He aims to develop AI that not only mimics human intelligence but also enhances it. Jaward has significantly contributed to the open-source community with fundamental implementations of AI/ML research papers. He completed his first internship at the Beijing Academy of Artificial Intelligence, where he contributed to cutting-edge research. His work led to the publication of an insightful paper, "AUTOAGENTS - A Framework for Automatic Agent Generation," accepted at IJCAI this year. Currently, Jaward is interning at LinkSoul AI, a small open-source AI research startup in Beijing.
	[USER]
	"""

	# Module-level chat history: a list of {"role": ..., "content": ...} dicts
	# appended to by models() and reset by clear_history(). Because it lives at
	# module scope, every caller in this process reads and writes the same list.
	conversation_history = []

	# Upper bound on stored history entries: 20 messages, i.e. the last
	# 10 user/assistant exchanges.
	_MAX_HISTORY_MESSAGES = 20


	def _remember_exchange(user_text, assistant_text):
	    """Append one user/assistant exchange to the shared history and trim it."""
	    global conversation_history
	    conversation_history.append({"role": "user", "content": user_text})
	    conversation_history.append({"role": "assistant", "content": assistant_text})
	    # Drop the oldest entries so the prompt does not grow without bound.
	    if len(conversation_history) > _MAX_HISTORY_MESSAGES:
	        conversation_history = conversation_history[-_MAX_HISTORY_MESSAGES:]


	def models(text, model="Llama 3 8B Service", seed=42):
	    """Ask the selected LLM for a reply and record the exchange in the history.

	    Args:
	        text: The user's message.
	        model: Model label from the UI; selects the client via ``llm_clients``.
	        seed: Replaced by a freshly randomized seed before use; only the
	            text-generation (non-service) path forwards it to the backend.

	    Returns:
	        The assistant's reply text. As a side effect the user message and the
	        reply are appended to ``conversation_history``, which is trimmed to
	        the most recent 20 entries.
	    """
	    seed = int(randomize_seed_fn(seed))
	    client = llm_clients(model)

	    if "Llama 3 8B Service" in model:
	        # Chat-style request: system prompt, prior turns, then the new message.
	        messages = (
	            [{"role": "system", "content": system_prompt}]
	            + conversation_history
	            + [{"role": "user", "content": text}]
	        )
	        completion = client.chat.completions.create(
	            model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
	            messages=messages,
	        )
	        reply = completion.choices[0].message.content
	    else:
	        # Plain text-generation request: flatten the history into one prompt.
	        history_text = "\n".join(
	            f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
	            for msg in conversation_history
	        )
	        formatted_prompt = f"{system_prompt}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:"

	        stream = client.text_generation(
	            formatted_prompt,
	            max_new_tokens=300,
	            seed=seed,
	            stream=True,
	            details=True,
	            return_full_text=False,
	        )
	        # Concatenate streamed tokens, skipping the end-of-sequence marker.
	        reply = "".join(
	            chunk.token.text for chunk in stream if chunk.token.text != "</s>"
	        )

	    _remember_exchange(text, reply)
	    return reply

	def translate_speech(audio_file, target_language):
	    """Translate a spoken recording into another language via ``expressivity_predict``.

	    Args:
	        audio_file: Path of the recording to translate, or ``None``.
	        target_language: Display name of the target language; must be a key
	            of ``LANGUAGE_CODES``.

	    Returns:
	        Path of the generated audio file, or ``None`` when ``audio_file`` is
	        ``None`` or the command produced no output file.

	    Raises:
	        KeyError: If ``target_language`` is not in ``LANGUAGE_CODES``.
	        subprocess.CalledProcessError: If the command exits with a non-zero
	            status.
	    """
	    if audio_file is None:
	        return None

	    language_code = LANGUAGE_CODES[target_language]

	    # Write each result into its own fresh temporary directory. A single fixed
	    # file name in the working directory would be shared by every request, so
	    # one translation could overwrite another before it is delivered.
	    output_dir = tempfile.mkdtemp(prefix="translated_audio_")
	    output_file = os.path.join(output_dir, "translated_audio.wav")

	    command = [
	        "expressivity_predict",
	        audio_file,
	        "--tgt_lang", language_code,
	        "--model_name", "seamless_expressivity",
	        "--vocoder_name", "vocoder_pretssel",
	        "--gated-model-dir", "models",
	        "--output_path", output_file,
	    ]

	    # Argument list (no shell) with check=True: a failing run raises instead
	    # of being silently ignored.
	    subprocess.run(command, check=True)

	    if not os.path.exists(output_file):
	        print(f"File not found: {output_file}")
	        return None

	    print(f"File created successfully: {output_file}")
	    return output_file

	async def respond(audio, model, seed, target_language):
	    """Handle one recording: translate it, or answer it with synthesized speech.

	    The recording is transcribed first. If the transcript starts with
	    "please translate" (case-insensitive) the recording is passed to
	    ``translate_speech``; otherwise the transcript is sent to the LLM and the
	    reply is synthesized with edge-tts.

	    Args:
	        audio: Path of the recorded audio, or ``None``.
	        model: Model label forwarded to ``models``.
	        seed: Seed value forwarded to ``models``.
	        target_language: Display name of the translation target language.

	    Returns:
	        A ``(assistant_audio_path, translated_audio_path, status_message)``
	        tuple; the path that does not apply is ``None``. Errors are reported
	        through the status message rather than raised.
	    """
	    # NOTE(review): transcribe(), models() and translate_speech() are plain
	    # synchronous calls made from this coroutine -- confirm that blocking
	    # here is acceptable for the deployment.
	    try:
	        if audio is None:
	            return None, None, "No input detected."

	        user_input = transcribe(audio)
	        if not user_input:
	            return None, None, "Could not transcribe audio."

	        if user_input.lower().startswith("please translate"):
	            # The recording itself, not the transcript, is what gets translated.
	            translated_audio = translate_speech(audio, target_language)
	            if translated_audio is None:
	                # Do not claim success when no output file was produced.
	                return None, None, "Translation failed: no output file was produced."
	            return None, translated_audio, f"Translated to {target_language}"

	        reply = models(user_input, model, seed)
	        communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
	        # edge-tts writes MP3 data, so name the file accordingly. The handle is
	        # closed before saving so nothing else holds the file open while
	        # edge-tts writes to it.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
	            tmp_path = tmp_file.name
	        await communicate.save(tmp_path)
	        return tmp_path, None, "Voice assistant response"
	    except ClientDisconnect:
	        print("Client disconnected")
	        return None, None, "Client disconnected. Please try again."
	    except Exception as e:
	        # Top-level UI boundary: surface the failure in the status box.
	        print(f"An error occurred: {str(e)}")
	        return None, None, f"An error occurred: {str(e)}"

	def clear_history():
	    """Discard the stored conversation and blank the output widgets.

	    Returns:
	        ``(None, None, status)``: empty values for the two audio outputs
	        followed by the confirmation text for the status box.
	    """
	    global conversation_history
	    conversation_history = list()
	    cleared_status = "Conversation history cleared."
	    return None, None, cleared_status

	# Gradio UI: controls in the left column, results in the right column.
	with gr.Blocks(css="style.css") as demo:
	    gr.Markdown("# <br><center><b>Optimus Prime: Your Personal AI Voice Assistant with Speech Translation</b></center>")
	    gr.Markdown("## <center><b>For speech translation, start with the phrase 'Please translate' followed by the speech you want to translate</b></center><br>")

	    with gr.Row():
	        # Left column: microphone input and request settings.
	        with gr.Column(scale=1):
	            input_audio = gr.Audio(
	                label="Click record and start speaking",
	                sources=["microphone"],
	                type="filepath",
	            )
	            select = gr.Dropdown(
	                choices=[
	                    'Llama 3 8B Service',
	                    'Mixtral 8x7B',
	                    'Llama 3 8B',
	                    'Mistral 7B v0.3',
	                    'Phi 3 mini',
	                ],
	                value="Llama 3 8B Service",
	                label="Model",
	            )
	            # Hidden control: kept only so a seed value is passed to respond().
	            seed = gr.Slider(label="Seed", minimum=0, maximum=999999, step=1, value=0, visible=False)
	            target_lang = gr.Dropdown(
	                choices=list(LANGUAGE_CODES),
	                value="German",
	                label="Target Language for Translation",
	            )
	            clear_button = gr.Button("Clear Conversation History")

	        # Right column: spoken reply, translated speech and a status line.
	        with gr.Column(scale=1):
	            output_audio = gr.Audio(
	                label="AI Voice Assistant's Response",
	                type="filepath",
	                interactive=False,
	                autoplay=True,
	            )
	            translated_audio = gr.Audio(
	                label="Translated Speech",
	                type="filepath",
	                interactive=False,
	                autoplay=True,
	            )
	            status_message = gr.Textbox(label="Status", interactive=False)

	    # A change of the recording triggers the assistant / translation pipeline.
	    input_audio.change(
	        fn=respond,
	        inputs=[input_audio, select, seed, target_lang],
	        outputs=[output_audio, translated_audio, status_message],
	    )

	    # The clear button resets the history and blanks all three outputs.
	    clear_button.click(
	        fn=clear_history,
	        inputs=[],
	        outputs=[output_audio, translated_audio, status_message],
	    )

	if __name__ == "__main__":
	    # Queue up to 200 pending requests, then start the server.
	    demo.queue(max_size=200).launch()