import subprocess
import sys
import os
import torch
import platform


def install_flash_attention():
    # --- Step 1: Detect system info ---
    py_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
    torch_version = torch.__version__.split("+")[0]  # e.g., '2.6.0'
    cuda_version = torch.version.cuda or "cpu"
    cxx11abi = "FALSE" if torch._C._GLIBCXX_USE_CXX11_ABI == 0 else "TRUE"
    system = platform.system().lower()
    arch = platform.machine()

    # --- Step 2: Normalize CUDA and torch version formatting ---
    if cuda_version != "cpu":
        # Keep only the CUDA major version (e.g., 12.4 -> cu12), matching
        # the tags used by the official flash-attention release wheels
        cuda_major = cuda_version.split(".")[0]
        cuda_tag = f"cu{cuda_major}"
    else:
        cuda_tag = "cpu"

    # Use only torch major.minor (e.g., 2.6.0 -> 2.6); split on dots rather
    # than slicing so two-digit minor versions are handled correctly
    torch_tag = ".".join(torch_version.split(".")[:2])

    # --- Step 3: Build the wheel URL ---
    base_url = "https://github.com/Dao-AILab/flash-attention/releases/download"
    release_tag = "v2.7.4.post1"
    wheel_name = (
        f"flash_attn-2.7.4.post1+{cuda_tag}torch{torch_tag}"
        f"cxx11abi{cxx11abi}-"
        f"{py_version}-{py_version}-linux_x86_64.whl"
    )
    wheel_url = f"{base_url}/{release_tag}/{wheel_name}"
    print(f"Installing FlashAttention wheel:\n{wheel_url}\n")

    # --- Step 4: Install it ---
    env = dict(**os.environ, FLASH_ATTENTION_SKIP_CUDA_BUILD="TRUE")
    # Install through the current interpreter's pip so the wheel lands in
    # the same environment that detected the versions above
    subprocess.run(
        [sys.executable, "-m", "pip", "install", wheel_url, "--no-build-isolation"],
        env=env,
        check=True,
    )


install_flash_attention()
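
# Post-install sanity check (an added safeguard, not part of the original
# flow): confirm the prebuilt wheel actually imports. If it fails, the wheel
# tag built above probably does not match this Python/torch/CUDA combination.
try:
    import flash_attn  # noqa: F401
    print(f"FlashAttention {flash_attn.__version__} is available")
except ImportError as e:
    print(f"Warning: flash_attn failed to import ({e}); continuing without it")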

import gradio as gr
import spaces
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from src.mimo_audio.modeling_mimo_audio import MiMoAudioArguments, MiMoAudioForCausalLM
from peft import PeftModel
from src.mimo_audio.mimo_audio import MimoAudio
import tempfile

# Download base models from Hugging Face
print("Downloading MiMo-Audio base models from Hugging Face...")
base_model_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-7B-Instruct")
tokenizer_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-Tokenizer")
print(f"Base models downloaded to: {base_model_path}")

# Download both LoRA weights
print("Downloading EmoAct-MiMo v1.0 LoRA weights...")
hf_token = os.environ.get("HF_TOKEN")
lora_v1_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo", token=hf_token)
print(f"LoRA v1.0 weights downloaded to: {lora_v1_path}")

print("Downloading EmoAct-MiMo v1.2 (Beta) LoRA weights...")
lora_v1_2_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo-v1.2", token=hf_token)
print(f"LoRA v1.2 (Beta) weights downloaded to: {lora_v1_2_path}")

# Load tokenizer and get special tokens
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
sosp_idx = tokenizer.convert_tokens_to_ids("<|sosp|>")
eosp_idx = tokenizer.convert_tokens_to_ids("<|eosp|>")
empty_token = tokenizer.convert_tokens_to_ids("<|empty|>")
sostm_idx = tokenizer.convert_tokens_to_ids("<|sostm|>")
eostm_idx = tokenizer.convert_tokens_to_ids("<|eostm|>")
eot_idx = tokenizer.convert_tokens_to_ids("<|eot|>")
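
# Sanity check (an added safeguard): convert_tokens_to_ids silently returns
# the unk id for tokens missing from the vocabulary, so fail fast if any of
# the MiMo special tokens did not resolve.
_special_tokens = {"<|sosp|>": sosp_idx, "<|eosp|>": eosp_idx, "<|empty|>": empty_token,
                   "<|sostm|>": sostm_idx, "<|eostm|>": eostm_idx, "<|eot|>": eot_idx}
for _tok, _idx in _special_tokens.items():
    if _idx is None or _idx == tokenizer.unk_token_id:
        raise ValueError(f"Special token {_tok} missing from tokenizer vocabulary")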

# Create model args
model_args = MiMoAudioArguments(
    model_name_or_path=base_model_path,
    sosp_idx=sosp_idx,
    eosp_idx=eosp_idx,
    empty_idx=empty_token,
    sostm_idx=sostm_idx,
    eostm_idx=eostm_idx,
    eot_idx=eot_idx,
)

# Load base model for v1.0
print("Loading base MiMo-Audio model for v1.0...")
base_model_v1 = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model v1.0 loaded")

# Load and merge LoRA v1.0
print("Loading LoRA v1.0 adapter...")
model_with_lora_v1 = PeftModel.from_pretrained(base_model_v1, lora_v1_path)
print("Merging LoRA v1.0 weights...")
merged_model_v1 = model_with_lora_v1.merge_and_unload()
print("LoRA v1.0 weights merged!")

# Save merged model v1.0 to temporary directory
print("Saving merged model v1.0...")
merged_model_v1_path = "/tmp/merged_mimo_audio_v1"
os.makedirs(merged_model_v1_path, exist_ok=True)
merged_model_v1.save_pretrained(merged_model_v1_path)
tokenizer.save_pretrained(merged_model_v1_path)
print(f"Merged model v1.0 saved to {merged_model_v1_path}")

# Load base model for v1.2
print("Loading base MiMo-Audio model for v1.2...")
base_model_v1_2 = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model v1.2 loaded")

# Load and merge LoRA v1.2
print("Loading LoRA v1.2 (Beta) adapter...")
model_with_lora_v1_2 = PeftModel.from_pretrained(base_model_v1_2, lora_v1_2_path)
print("Merging LoRA v1.2 (Beta) weights...")
merged_model_v1_2 = model_with_lora_v1_2.merge_and_unload()
print("LoRA v1.2 (Beta) weights merged!")

# Save merged model v1.2 to temporary directory
print("Saving merged model v1.2...")
merged_model_v1_2_path = "/tmp/merged_mimo_audio_v1_2"
os.makedirs(merged_model_v1_2_path, exist_ok=True)
merged_model_v1_2.save_pretrained(merged_model_v1_2_path)
tokenizer.save_pretrained(merged_model_v1_2_path)
print(f"Merged model v1.2 (Beta) saved to {merged_model_v1_2_path}")
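
# Optional memory cleanup (a sketch, assuming the MimoAudio wrappers below
# reload everything from the saved directories and nothing else references
# these objects): drop the in-memory copies so multiple bf16 models are not
# resident at once.
del model_with_lora_v1, merged_model_v1, base_model_v1
del model_with_lora_v1_2, merged_model_v1_2, base_model_v1_2
if torch.cuda.is_available():
    torch.cuda.empty_cache()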

# Initialize both MimoAudio models
print("Initializing MimoAudio wrappers...")
model_v1 = MimoAudio(
    model_path=merged_model_v1_path,
    mimo_audio_tokenizer_path=tokenizer_path
)
model_v1_2 = MimoAudio(
    model_path=merged_model_v1_2_path,
    mimo_audio_tokenizer_path=tokenizer_path
)
print("Both models ready!")

# Dictionary mapping the dropdown labels to the loaded models
models = {
    "EmoAct-MiMo v1.0 (Stable)": model_v1,
    "EmoAct-MiMo v1.2 (Beta - Experimental)": model_v1_2
}

# ZeroGPU note (this Space runs on "Zero" hardware and `spaces` is imported
# above): GPU-bound functions must be wrapped with @spaces.GPU so a GPU is
# attached for the duration of the call.
@spaces.GPU
def generate_speech(model_choice, emotion, text):
    """Generate emotional speech from text using the selected EmoAct-MiMo model."""
    if not emotion or not emotion.strip():
        return None, "Please enter an emotion description."
    if not text or not text.strip():
        return None, "Please enter text to convert to speech."

    print(f"Using model: {model_choice}")
    print("Generating:", text)
    print("With emotion:", emotion)

    try:
        # Select the appropriate model
        model = models[model_choice]

        # Create a temporary file for the output audio
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # Generate TTS with the emotion description as the instruction
        model.tts_sft(
            text=text.strip(),
            output_path=output_path,
            instruct=emotion.strip()
        )

        return output_path, f"Speech generated successfully using {model_choice}!"
    except Exception as e:
        return None, f"Error: {str(e)}"
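
# Illustrative direct invocation (hypothetical values, shown for clarity;
# the Gradio UI below is the intended entry point):
#   audio_path, status = generate_speech(
#       "EmoAct-MiMo v1.0 (Stable)",
#       "calm, warm, reassuring narration",
#       "Everything is going to be alright.",
#   )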

# Create Gradio interface
with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
    gr.Markdown("""
    # EmoAct-MiMo: Emotion-Controllable Text-to-Speech

    Generate intensely emotional speech using the [EmoAct-MiMo model](https://huggingface.co/mrfakename/EmoAct-MiMo).

    This is still an early experiment from early in the training run; a few settings need changing before a retrain, but the model has already turned out quite nicely!

    It may hallucinate; try a few times to get good results.

    Voice cloning is not supported yet.
    """)

    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=["EmoAct-MiMo v1.0 (Stable)", "EmoAct-MiMo v1.2 (Beta - Experimental)"],
                value="EmoAct-MiMo v1.0 (Stable)",
                label="Model Selection",
                info="v1.0 is the current stable model; v1.2 is an experimental beta that may behave differently."
            )
            emotion_input = gr.Textbox(
                label="Emotion",
                placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
                lines=3
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter the text to speak with emotion...",
                lines=5
            )
            generate_btn = gr.Button("Generate Emotional Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Intense emotion examples
    gr.Examples(
        examples=[
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
                "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again."
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
                "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?"
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
                "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive."
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
                "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!"
            ],
            [
                "EmoAct-MiMo v1.2 (Beta - Experimental)",
                "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
                "What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside."
            ],
            [
                "EmoAct-MiMo v1.2 (Beta - Experimental)",
                "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
                "Of course they chose you. They always choose you. <laugh> Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have."
            ]
        ],
        inputs=[model_selector, emotion_input, text_input]
    )

    # Event handler
    generate_btn.click(
        fn=generate_speech,
        inputs=[model_selector, emotion_input, text_input],
        outputs=[audio_output, status_output]
    )

if __name__ == "__main__":
    demo.launch()