import subprocess
import sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/[email protected]"])
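# Note: these runtime installs pin pip, omegaconf, and fairseq before the model
# code is imported. fairseq 0.12.2 expects omegaconf 2.0.x, and the older pip
# release is presumably pinned because newer pip versions reject fairseq's
# legacy dependency metadata.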
import gradio as gr
import os
import torch
import librosa
import soundfile as sf
import tempfile
import spaces # ZeroGPU requirement
# Import the EchoX inference module
import Echox_copy_stream as Echox
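# Disable tokenizer parallelism to avoid the Hugging Face tokenizers fork
# warning when Gradio spawns worker processes.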
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Global state
_MODEL_ON_CUDA = False
inference_model = None
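# ZeroGPU pattern: the model is constructed on CPU at startup and only moved
# to CUDA inside the @spaces.GPU-decorated handler, since the GPU is attached
# to the process only while such a handler is running.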
def init_model():
    """Initialize the model on CPU."""
    global inference_model
    if inference_model is None:
        inference_model = Echox.EchoxAssistant()
    return inference_model
def process_audio_input(audio):
    """Convert the audio input into a WAV file path."""
    if audio is None:
        return None
    try:
        # If it is already a file path, return it directly
        if isinstance(audio, str):
            return audio
        # If it is a (sample_rate, data) tuple
        if isinstance(audio, tuple):
            sr, y = audio
            if y.ndim > 1:
                y = y[:, 0]  # keep only the first channel
        else:
            # If it is a bare array
            y = audio
            sr = 16000  # default sample rate
        # Save to a temporary file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            sf.write(tmp_file.name, y, sr)
            return tmp_file.name
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None
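# The decorator below requests a ZeroGPU allocation for up to 120 seconds per
# call; the _MODEL_ON_CUDA flag assumes the .to("cuda") move persists across
# calls, so the transfer only happens on the first request.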
@spaces.GPU(duration=120)
def process_audio_text(text, audio):
    global _MODEL_ON_CUDA, inference_model
    if inference_model is None:
        init_model()
    # Move the model weights to the GPU the first time this handler runs
    if not _MODEL_ON_CUDA:
        try:
            if hasattr(inference_model, 'model'):
                inference_model.model = inference_model.model.to("cuda")
            if hasattr(inference_model, 'unit_translator'):
                inference_model.unit_translator = inference_model.unit_translator.to("cuda")
            inference_model.device = "cuda"
            _MODEL_ON_CUDA = True
            print("Model moved to GPU")
        except Exception as e:
            print(f"Error moving model to GPU: {e}")
    audio_path = process_audio_input(audio)
    tmp = [{
        "conversations": [
            {
                "from": "user",
                "value": text,
                "audio": audio_path
            }
        ]
    }]
    accumulated_text = ""
    try:
        for text_response, audio_data in inference_model._inference(tmp):
            if text_response:
                accumulated_text = text_response
            if audio_data is not None:
                sr, audio_array = audio_data
                yield (sr, audio_array), accumulated_text
            else:
                yield None, accumulated_text
    except Exception as e:
        yield None, f"Error: {str(e)}"
    finally:
        # Remove the temporary WAV file if one was created
        if audio_path and audio_path != audio and os.path.exists(audio_path):
            try:
                os.unlink(audio_path)
            except OSError:
                pass
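# Build the model on CPU at startup so the first GPU request only pays for the
# CPU-to-GPU transfer, not for model construction.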
init_model()
if __name__ == "__main__":
    examples = [
        ["Recognize what the voice said and respond to it.", "./show_case/1.wav"],
        ["", "./show_case/2.wav"],
    ]
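    # process_audio_text is a generator, so Gradio streams each yielded
    # (sample_rate, waveform) chunk to the streaming Audio output while the
    # textbox shows the accumulated text response.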
    iface = gr.Interface(
        fn=process_audio_text,
        inputs=[
            gr.Textbox(label="Input Text", value=examples[0][0]),
            gr.Audio(type="filepath", label="Upload Audio", value=examples[0][1])
        ],
        outputs=[
            gr.Audio(label="Streamed Audio", streaming=True, autoplay=True),
            gr.Textbox(label="Model output")
        ],
        examples=examples,
        live=False,
        allow_flagging="never"
    )

    iface.launch(server_name="0.0.0.0", server_port=7860, share=False)