BryanBradfo committed
Commit 4c65a2b · 1 Parent(s): c796506
Files changed (3)
  1. app.py +43 -136
  2. not.txt +10 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -3,144 +3,35 @@ import os
 import time
 from pathlib import Path
 
+import anthropic
 import gradio as gr
 import numpy as np
+from dotenv import load_dotenv
+from elevenlabs import ElevenLabs
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse, StreamingResponse
 from fastrtc import (
     AdditionalOutputs,
     ReplyOnPause,
     Stream,
+    get_tts_model,
     get_twilio_turn_credentials,
 )
 from fastrtc.utils import audio_to_bytes
 from gradio.utils import get_space
+from groq import Groq
 from pydantic import BaseModel
-import torch
-
-# Import for open-source models
-from transformers import (
-    AutoModelForSpeechSeq2Seq,
-    AutoProcessor,
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    pipeline
-)
-import gc
 
-# Current directory
+load_dotenv()
+
+groq_client = Groq()
+claude_client = anthropic.Anthropic()
+tts_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
+
 curr_dir = Path(__file__).parent
 
-# Load ASR model (Whisper small for lightweight usage)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
-print("Loading ASR model...")
-asr_model_id = "openai/whisper-small"
-asr_processor = AutoProcessor.from_pretrained(asr_model_id)
-asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    asr_model_id,
-    torch_dtype=torch_dtype,
-    low_cpu_mem_usage=True,
-    use_safetensors=True
-)
-asr_model.to(device)
-asr_pipe = pipeline(
-    "automatic-speech-recognition",
-    model=asr_model,
-    tokenizer=asr_processor.tokenizer,
-    feature_extractor=asr_processor.feature_extractor,
-    max_new_tokens=128,
-    chunk_length_s=30,
-    batch_size=16,
-    return_timestamps=False,
-    device=device,
-)
-
-# Load LLM (TinyLlama for lightweight operation)
-print("Loading LLM model...")
-llm_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
-llm_model = AutoModelForCausalLM.from_pretrained(
-    llm_model_id,
-    torch_dtype=torch_dtype,
-    low_cpu_mem_usage=True,
-    use_safetensors=True
-)
-llm_model.to(device)
-
-# Load TTS model (Piper TTS or CoquiTTS)
-print("Loading TTS model...")
-from TTS.api import TTS
-tts_model = TTS("tts_models/en/ljspeech/tacotron2-DDC", gpu=torch.cuda.is_available())
-
-# Free up memory after loading models
-gc.collect()
-torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
-def transcribe_audio(audio):
-    """Transcribe audio using Whisper model"""
-    result = asr_pipe({"array": audio[1], "sampling_rate": audio[0]})
-    return result["text"]
-
-def generate_llm_response(messages):
-    """Generate response using TinyLlama model"""
-    # Format messages for TinyLlama
-    prompt = ""
-    for msg in messages:
-        if msg["role"] == "user":
-            prompt += f"<|user|>\n{msg['content']}\n<|assistant|>\n"
-        elif msg["role"] == "assistant":
-            prompt += f"{msg['content']}\n"
-
-    # Add final assistant token if not present
-    if not prompt.endswith("<|assistant|>\n"):
-        prompt += "<|assistant|>\n"
-
-    # Generate response
-    inputs = llm_tokenizer(prompt, return_tensors="pt").to(device)
-    outputs = llm_model.generate(
-        inputs.input_ids,
-        max_new_tokens=512,
-        do_sample=True,
-        temperature=0.7,
-        top_p=0.9,
-    )
-    response = llm_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
-    return response
-
-def generate_speech(text):
-    """Generate speech from text using TTS model"""
-    # Create a temporary file path
-    output_path = curr_dir / "temp_audio.wav"
-
-    # Generate audio
-    tts_model.tts_to_file(text=text, file_path=str(output_path))
-
-    # Read audio file
-    import wave
-    import numpy as np
-
-    with wave.open(str(output_path), 'rb') as wav_file:
-        # Get audio parameters
-        sample_rate = wav_file.getframerate()
-        n_frames = wav_file.getnframes()
-        n_channels = wav_file.getnchannels()
-
-        # Read audio data
-        data = wav_file.readframes(n_frames)
-
-        # Convert to numpy array
-        audio_data = np.frombuffer(data, dtype=np.int16)
-        if n_channels == 2:  # Convert stereo to mono
-            audio_data = audio_data.reshape(-1, 2).mean(axis=1).astype(np.int16)
-
-    # Clean up temp file
-    if output_path.exists():
-        os.remove(output_path)
-
-    # Return audio data and sample rate
-    return (sample_rate, audio_data)
+tts_model = get_tts_model()
 
+
 def response(
     audio: tuple[int, np.ndarray],
@@ -148,28 +39,36 @@ def response(
 ):
     chatbot = chatbot or []
     messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
-
-    # Transcribe audio
-    prompt = transcribe_audio(audio)
+    prompt = groq_client.audio.transcriptions.create(
+        file=("audio-file.mp3", audio_to_bytes(audio)),
+        model="whisper-large-v3-turbo",
+        response_format="verbose_json",
+    ).text
     chatbot.append({"role": "user", "content": prompt})
     yield AdditionalOutputs(chatbot)
     messages.append({"role": "user", "content": prompt})
-
-    # Generate response
-    response_text = generate_llm_response(messages)
+    response = claude_client.messages.create(
+        model="claude-3-5-haiku-20241022",
+        max_tokens=512,
+        messages=messages,  # type: ignore
+    )
+    response_text = " ".join(
+        block.text  # type: ignore
+        for block in response.content
+        if getattr(block, "type", None) == "text"
+    )
     chatbot.append({"role": "assistant", "content": response_text})
-
-    # Generate speech
+
     start = time.time()
+
     print("starting tts", start)
-
-    # Generate speech in a single call for simplicity
-    audio_output = generate_speech(response_text)
-    print("finished tts", time.time() - start)
-    yield audio_output
-    yield AdditionalOutputs(chatbot)
+    for i, chunk in enumerate(tts_model.stream_tts_sync(response_text)):
+        print("chunk", i, time.time() - start)
+        yield chunk
+    print("finished tts", time.time() - start)
+    yield AdditionalOutputs(chatbot)
+
 
-# Set up Gradio chatbot interface
 chatbot = gr.Chatbot(type="messages")
 stream = Stream(
     modality="audio",
@@ -183,17 +82,21 @@ stream = Stream(
     time_limit=90 if get_space() else None,
 )
 
+
 class Message(BaseModel):
     role: str
     content: str
 
+
 class InputData(BaseModel):
     webrtc_id: str
     chatbot: list[Message]
 
+
 app = FastAPI()
 stream.mount(app)
 
+
 @app.get("/")
 async def _():
     rtc_config = get_twilio_turn_credentials() if get_space() else None
@@ -201,11 +104,13 @@ async def _():
     html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
     return HTMLResponse(content=html_content, status_code=200)
 
+
 @app.post("/input_hook")
 async def _(body: InputData):
     stream.set_input(body.webrtc_id, body.model_dump()["chatbot"])
     return {"status": "ok"}
 
+
 @app.get("/outputs")
 def _(webrtc_id: str):
     async def output_stream():
@@ -215,6 +120,7 @@ def _(webrtc_id: str):
 
     return StreamingResponse(output_stream(), media_type="text/event-stream")
 
+
 if __name__ == "__main__":
     import os
 
@@ -224,4 +130,5 @@ if __name__ == "__main__":
         stream.fastphone(host="0.0.0.0", port=7860)
     else:
         import uvicorn
+
         uvicorn.run(app, host="0.0.0.0", port=7860)
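The rewritten response handler swaps the local Whisper/TinyLlama/Coqui stack for hosted services: Groq's whisper-large-v3-turbo transcribes the caller's audio, Claude 3.5 Haiku writes the reply, and fastrtc's bundled TTS model streams the synthesized audio back over WebRTC. (Note that the ElevenLabs client is instantiated but never used in the new handler; synthesis goes through get_tts_model().) Below is a minimal sketch of the same three stages outside the WebRTC stream, useful for smoke-testing API keys; the standalone structure and the sample.wav file are illustrative assumptions, not part of the commit.

# Standalone smoke test of the commit's STT -> LLM -> TTS pipeline.
# Assumes GROQ_API_KEY and ANTHROPIC_API_KEY are set in the environment
# and that a short sample.wav recording exists (both are assumptions).
import anthropic
from fastrtc import get_tts_model
from groq import Groq

groq_client = Groq()
claude_client = anthropic.Anthropic()
tts_model = get_tts_model()

# 1) Speech-to-text with Groq's hosted Whisper.
with open("sample.wav", "rb") as f:
    prompt = groq_client.audio.transcriptions.create(
        file=("sample.wav", f.read()),
        model="whisper-large-v3-turbo",
    ).text
print("heard:", prompt)

# 2) Reply with Claude 3.5 Haiku.
reply = claude_client.messages.create(
    model="claude-3-5-haiku-20241022",
    max_tokens=512,
    messages=[{"role": "user", "content": prompt}],
)
response_text = " ".join(
    block.text for block in reply.content if block.type == "text"
)
print("reply:", response_text)

# 3) Stream TTS chunks, as app.py yields them over WebRTC.
for i, chunk in enumerate(tts_model.stream_tts_sync(response_text)):
    print("tts chunk", i)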
not.txt ADDED
@@ -0,0 +1,10 @@
+fastrtc[vad]
+gradio>=4.0.0
+transformers>=4.37.0
+torch>=2.0.0
+numpy>=1.24.0
+fastapi>=0.103.1
+uvicorn>=0.23.2
+TTS>=0.17.0
+pydantic>=2.0.0
+soundfile>=0.12.1
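not.txt appears to archive the dependency pins of the local-model stack this commit removes (transformers, torch, Coqui TTS, soundfile); nothing in the post-commit app reads it. A small sketch to check which of those archived pins are still installed, assuming the file sits in the working directory:

# Report which pins archived in not.txt are still installed.
# The path and this check are illustrative assumptions, not app code.
import re
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path

for line in Path("not.txt").read_text().splitlines():
    req = line.strip()
    if not req or req.startswith("#"):
        continue
    # Strip extras ("[vad]") and version specifiers (">=...").
    name = re.split(r"[\[><=]", req, maxsplit=1)[0]
    try:
        print(f"{req}: installed ({version(name)})")
    except PackageNotFoundError:
        print(f"{req}: not installed")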
requirements.txt CHANGED
@@ -5,3 +5,4 @@ groq
 anthropic
 twilio
 python-dotenv
+torch>=2.0.0
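The requirements change itself is one line: torch>=2.0.0 joins the API clients already listed. The commit doesn't say why torch is pinned here even though the local Transformers models are gone; it may simply be a leftover from the local-model experiment archived in not.txt. A quick import check over the runtime modules; the list is read off app.py's new import block plus this pin, an inferred manifest rather than an official one:

# Sanity-check that the Space's runtime dependencies import cleanly.
# The module list mirrors app.py's post-commit imports plus torch;
# it is inferred from this diff, not from an official manifest.
import importlib

modules = [
    "anthropic", "dotenv", "elevenlabs", "fastapi", "fastrtc",
    "gradio", "groq", "numpy", "pydantic", "torch", "uvicorn",
]
for mod in modules:
    try:
        importlib.import_module(mod)
        print(f"ok: {mod}")
    except ImportError as exc:
        print(f"missing: {mod} ({exc})")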