import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/[email protected]"])
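# The pinned versions above are installed at runtime so that fairseq and its
# omegaconf requirement are in place before Echox_copy_stream (which is assumed
# to depend on them) is imported below.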
import gradio as gr
import os
import torch
import librosa
import soundfile as sf
import tempfile
import spaces  # ZeroGPU requirement

# Import the EchoX inference module
import Echox_copy_stream as Echox

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence the Hugging Face tokenizers parallelism warning

# Global state
_MODEL_ON_CUDA = False
inference_model = None

def init_model():
    """在CPU上初始化模型"""
    global inference_model
    if inference_model is None:
        inference_model = Echox.EchoxAssistant()
    return inference_model

def process_audio_input(audio):
    """处理音频输入"""
    if audio is None:
        return None
    
    try:
        # If it is already a file path, return it directly
        if isinstance(audio, str):
            return audio
        
        # If it is a (sample_rate, data) numpy tuple
        if isinstance(audio, tuple):
            sr, y = audio
            if y.ndim > 1:
                y = y[:, 0]  # keep only the first channel
        else:
            # Otherwise treat it as a bare sample array
            y = audio
            sr = 16000  # default sample rate
        
        # Save to a temporary WAV file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            sf.write(tmp_file.name, y, sr)
            return tmp_file.name
            
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None

@spaces.GPU(duration=120)
def process_audio_text(text, audio):
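    """Run streaming inference; yields (audio_chunk, accumulated_text) pairs for the two Gradio outputs."""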
    global _MODEL_ON_CUDA, inference_model
    
    if inference_model is None:
        init_model()
    
    if not _MODEL_ON_CUDA:
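        # ZeroGPU only attaches a GPU inside @spaces.GPU-decorated calls, so the
        # weights are moved to CUDA lazily on the first request.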
        try:
            if hasattr(inference_model, 'model'):
                inference_model.model = inference_model.model.to("cuda")
            if hasattr(inference_model, 'unit_translator'):
                inference_model.unit_translator = inference_model.unit_translator.to("cuda")
            
            inference_model.device = "cuda"
            _MODEL_ON_CUDA = True
            print("Model moved to GPU")
        except Exception as e:
            print(f"Error moving model to GPU: {e}")
    
    audio_path = process_audio_input(audio)
    
    tmp = [{
        "conversations": [
            {
                "from": "user",
                "value": text,
                "audio": audio_path
            }
        ]
    }]
    
    accumulated_text = ""
    
    try:
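        # _inference is expected to be a generator yielding (partial_text, (sr, waveform) or None).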
        for text_response, audio_data in inference_model._inference(tmp):
            if text_response:
                accumulated_text = text_response
            
            if audio_data is not None:
                sr, audio_array = audio_data
                yield (sr, audio_array), accumulated_text
            else:
                yield None, accumulated_text
    except Exception as e:
        yield None, f"Error: {str(e)}"
    finally:
        if audio_path and audio_path != audio and os.path.exists(audio_path):
            try:
                os.unlink(audio_path)
            except OSError:
                pass

# Load the model on CPU at import time; it is moved to the GPU inside process_audio_text.
init_model()

if __name__ == "__main__":
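    # Example prompts; the audio paths are relative to the working directory.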
    examples = [
        ["Recognize what the voice said and respond to it.", "./show_case/1.wav"],
        ["", "./show_case/2.wav"],
    ]  

    iface = gr.Interface(
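        # process_audio_text is a generator, so Gradio streams each yielded (audio, text) update.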
        fn=process_audio_text,
        inputs=[
            gr.Textbox(label="Input Text", value=examples[0][0]),
            gr.Audio(type="filepath", label="Upload Audio", value=examples[0][1])
        ],
        outputs=[
            gr.Audio(label="Streamed Audio", streaming=True, autoplay=True),
            gr.Textbox(label="Model output")
        ],
        examples=examples,
        live=False,
        allow_flagging="never"
    )

    iface.launch(server_name="0.0.0.0", server_port=7860, share=False)