freddyaboulton committed (verified)
Commit 9977a2c · Parent(s): 427c3db

Upload folder using huggingface_hub

Files changed (4):
  1. README.md +7 -4
  2. app.py +167 -0
  3. index.html +298 -0
  4. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,15 @@
 ---
-title: Talk To Gemini
-emoji: 👀
-colorFrom: red
+title: Talk to Gemini
+emoji: ♊️
+colorFrom: purple
 colorTo: red
 sdk: gradio
 sdk_version: 5.16.0
 app_file: app.py
 pinned: false
+license: mit
+short_description: Talk to Gemini using Google's multimodal API
+tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,167 @@
+import asyncio
+import base64
+import pathlib
+from typing import AsyncGenerator, Literal
+from dotenv import load_dotenv
+import os
+from google import genai
+from pydantic import BaseModel
+from google.genai.types import (
+    LiveConnectConfig,
+    PrebuiltVoiceConfig,
+    SpeechConfig,
+    VoiceConfig,
+)
+import gradio as gr
+import json
+from gradio.utils import get_space
+from fastrtc import (
+    Stream,
+    AsyncStreamHandler,
+    async_aggregate_bytes_to_16bit,
+    get_twilio_turn_credentials,
+)
+import numpy as np
+from fastapi.responses import HTMLResponse
+
+current_dir = pathlib.Path(__file__).parent
+
+load_dotenv()
+
+
+def encode_audio(data: np.ndarray) -> str:
+    """Encode audio data to send to the server"""
+    return base64.b64encode(data.tobytes()).decode("UTF-8")
+
+
+class GeminiHandler(AsyncStreamHandler):
+    """Handler for the Gemini API"""
+
+    def __init__(
+        self,
+        expected_layout: Literal["mono"] = "mono",
+        output_sample_rate: int = 24000,
+        output_frame_size: int = 480,
+    ) -> None:
+        super().__init__(
+            expected_layout,
+            output_sample_rate,
+            output_frame_size,
+            input_sample_rate=16000,
+        )
+        self.input_queue: asyncio.Queue = asyncio.Queue()
+        self.output_queue: asyncio.Queue = asyncio.Queue()
+        self.quit: asyncio.Event = asyncio.Event()
+
+    def copy(self) -> "GeminiHandler":
+        return GeminiHandler(
+            expected_layout="mono",
+            output_sample_rate=self.output_sample_rate,
+            output_frame_size=self.output_frame_size,
+        )
+
+    async def stream(self) -> AsyncGenerator[bytes, None]:
+        while not self.quit.is_set():
+            audio = await self.input_queue.get()
+            yield audio
+        return
+
+    async def connect(
+        self, api_key: str | None = None, voice_name: str | None = None
+    ) -> AsyncGenerator[bytes, None]:
+        """Connect to the genai server and start the stream"""
+        client = genai.Client(
+            api_key=api_key or os.getenv("GEMINI_API_KEY"),
+            http_options={"api_version": "v1alpha"},
+        )
+        config = LiveConnectConfig(
+            response_modalities=["AUDIO"],  # type: ignore
+            speech_config=SpeechConfig(
+                voice_config=VoiceConfig(
+                    prebuilt_voice_config=PrebuiltVoiceConfig(
+                        voice_name=voice_name,
+                    )
+                )
+            ),
+        )
+        async with client.aio.live.connect(
+            model="gemini-2.0-flash-exp", config=config
+        ) as session:
+            async for audio in session.start_stream(
+                stream=self.stream(), mime_type="audio/pcm"
+            ):
+                if audio.data:
+                    yield audio.data
+
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        _, array = frame
+        array = array.squeeze()
+        audio_message = encode_audio(array)
+        self.input_queue.put_nowait(audio_message)
+
+    async def generator(self) -> None:
+        async for audio_response in async_aggregate_bytes_to_16bit(
+            self.connect(*self.latest_args[1:])
+        ):
+            self.output_queue.put_nowait(audio_response)
+
+    async def emit(self) -> tuple[int, np.ndarray]:
+        if not self.args_set.is_set():
+            await self.wait_for_args()
+            asyncio.create_task(self.generator())
+
+        array = await self.output_queue.get()
+        return (self.output_sample_rate, array)
+
+    def shutdown(self) -> None:
+        self.quit.set()
+        self.args_set.clear()
+        self.quit.clear()
+
+
+stream = Stream(
+    modality="audio",
+    mode="send-receive",
+    handler=GeminiHandler(),
+    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
+    additional_inputs=[
+        gr.Textbox(label="API Key", type="password", value=os.getenv("GEMINI_API_KEY")),
+        gr.Dropdown(
+            label="Voice",
+            choices=[
+                "Puck",
+                "Charon",
+                "Kore",
+                "Fenrir",
+                "Aoede",
+            ],
+            value="Puck",
+        ),
+    ],
+)
+
+
+class InputData(BaseModel):
+    webrtc_id: str
+    voice_name: str
+    api_key: str
+
+
+@stream.post("/input_hook")
+async def _(body: InputData):
+    stream.set_input(body.webrtc_id, body.api_key, body.voice_name)
+    return {"status": "ok"}
+
+
+@stream.get("/")
+async def index():
+    rtc_config = get_twilio_turn_credentials() if get_space() else None
+    html_content = (current_dir / "index.html").read_text()
+    html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
+    return HTMLResponse(content=html_content)
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(stream, host="0.0.0.0", port=7860)
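Note (not part of the commit): the `/input_hook` route added above expects the JSON body described by `InputData`, and the browser client in index.html calls it after the WebRTC offer is accepted. A minimal sketch of exercising the route from Python, assuming the app is running locally on port 7860 and that `httpx` is available; the `webrtc_id` value is a placeholder, since the real id is generated by the browser for its offer:

```python
# Hypothetical local check of the /input_hook endpoint defined in app.py.
import httpx

resp = httpx.post(
    "http://localhost:7860/input_hook",
    json={
        "webrtc_id": "abc123",  # placeholder; the real id comes from the WebRTC offer
        "api_key": "",          # empty string makes the server fall back to GEMINI_API_KEY
        "voice_name": "Puck",
    },
)
print(resp.json())  # expected: {"status": "ok"}
```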
index.html ADDED
@@ -0,0 +1,298 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Gemini Voice Chat</title>
+    <style>
+        :root {
+            --color-accent: #6366f1;
+            --color-background: #0f172a;
+            --color-surface: #1e293b;
+            --color-text: #e2e8f0;
+            --boxSize: 8px;
+            --gutter: 4px;
+        }
+
+        body {
+            margin: 0;
+            padding: 0;
+            background-color: var(--color-background);
+            color: var(--color-text);
+            font-family: system-ui, -apple-system, sans-serif;
+            min-height: 100vh;
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            justify-content: center;
+        }
+
+        .container {
+            width: 90%;
+            max-width: 800px;
+            background-color: var(--color-surface);
+            padding: 2rem;
+            border-radius: 1rem;
+            box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.25);
+        }
+
+        .wave-container {
+            position: relative;
+            display: flex;
+            min-height: 100px;
+            max-height: 128px;
+            justify-content: center;
+            align-items: center;
+            margin: 2rem 0;
+        }
+
+        .box-container {
+            display: flex;
+            justify-content: space-between;
+            height: 64px;
+            width: 100%;
+        }
+
+        .box {
+            height: 100%;
+            width: var(--boxSize);
+            background: var(--color-accent);
+            border-radius: 8px;
+            transition: transform 0.05s ease;
+        }
+
+        .controls {
+            display: grid;
+            gap: 1rem;
+            margin-bottom: 2rem;
+        }
+
+        .input-group {
+            display: flex;
+            flex-direction: column;
+            gap: 0.5rem;
+        }
+
+        label {
+            font-size: 0.875rem;
+            font-weight: 500;
+        }
+
+        input,
+        select {
+            padding: 0.75rem;
+            border-radius: 0.5rem;
+            border: 1px solid rgba(255, 255, 255, 0.1);
+            background-color: var(--color-background);
+            color: var(--color-text);
+            font-size: 1rem;
+        }
+
+        button {
+            padding: 1rem 2rem;
+            border-radius: 0.5rem;
+            border: none;
+            background-color: var(--color-accent);
+            color: white;
+            font-weight: 600;
+            cursor: pointer;
+            transition: all 0.2s ease;
+        }
+
+        button:hover {
+            opacity: 0.9;
+            transform: translateY(-1px);
+        }
+    </style>
+</head>
+
+
+<body>
+    <div style="text-align: center">
+        <h1>Gemini Voice Chat</h1>
+        <p>Speak with Gemini using real-time audio streaming</p>
+        <p>
+            Get a Gemini API key
+            <a href="https://ai.google.dev/gemini-api/docs/api-key">here</a>
+        </p>
+    </div>
+    <div class="container">
+        <div class="controls">
+            <div class="input-group">
+                <label for="api-key">API Key</label>
+                <input type="password" id="api-key" placeholder="Enter your API key">
+            </div>
+            <div class="input-group">
+                <label for="voice">Voice</label>
+                <select id="voice">
+                    <option value="Puck">Puck</option>
+                    <option value="Charon">Charon</option>
+                    <option value="Kore">Kore</option>
+                    <option value="Fenrir">Fenrir</option>
+                    <option value="Aoede">Aoede</option>
+                </select>
+            </div>
+        </div>
+
+        <div class="wave-container">
+            <div class="box-container">
+                <!-- Boxes will be dynamically added here -->
+            </div>
+        </div>
+
+        <button id="start-button">Start Recording</button>
+    </div>
+
+    <audio id="audio-output"></audio>
+
+    <script>
+        let peerConnection;
+        let audioContext;
+        let dataChannel;
+        let isRecording = false;
+        let webrtc_id;
+
+        const startButton = document.getElementById('start-button');
+        const apiKeyInput = document.getElementById('api-key');
+        const voiceSelect = document.getElementById('voice');
+        const audioOutput = document.getElementById('audio-output');
+        const boxContainer = document.querySelector('.box-container');
+
+        const numBars = 32;
+        for (let i = 0; i < numBars; i++) {
+            const box = document.createElement('div');
+            box.className = 'box';
+            boxContainer.appendChild(box);
+        }
+
+        async function setupWebRTC() {
+            const config = __RTC_CONFIGURATION__;
+            peerConnection = new RTCPeerConnection(config);
+            webrtc_id = Math.random().toString(36).substring(7);
+
+            try {
+                const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+                stream.getTracks().forEach(track => peerConnection.addTrack(track, stream));
+
+                audioContext = new AudioContext();
+                analyser = audioContext.createAnalyser();
+                analyser.fftSize = 64;
+                analyser.smoothingTimeConstant = 0.8;
+                dataArray = new Uint8Array(analyser.frequencyBinCount);
+
+                // Handle incoming audio
+                peerConnection.addEventListener('track', (evt) => {
+                    if (audioOutput && audioOutput.srcObject !== evt.streams[0]) {
+                        audioOutput.srcObject = evt.streams[0];
+                        audioOutput.play();
+
+                        // Set up audio visualization on the output stream
+                        audioContext = new AudioContext();
+                        analyser = audioContext.createAnalyser();
+                        const source = audioContext.createMediaStreamSource(evt.streams[0]);
+                        source.connect(analyser);
+                        analyser.fftSize = 2048;
+                        dataArray = new Uint8Array(analyser.frequencyBinCount);
+                        updateVisualization();
+                    }
+                });
+
+                // Create data channel for messages
+                dataChannel = peerConnection.createDataChannel('text');
+                dataChannel.onmessage = handleMessage;
+
+                // Create and send offer
+                const offer = await peerConnection.createOffer();
+                await peerConnection.setLocalDescription(offer);
+
+                await new Promise((resolve) => {
+                    if (peerConnection.iceGatheringState === "complete") {
+                        resolve();
+                    } else {
+                        const checkState = () => {
+                            if (peerConnection.iceGatheringState === "complete") {
+                                peerConnection.removeEventListener("icegatheringstatechange", checkState);
+                                resolve();
+                            }
+                        };
+                        peerConnection.addEventListener("icegatheringstatechange", checkState);
+                    }
+                });
+
+                const response = await fetch('/webrtc/offer', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({
+                        sdp: peerConnection.localDescription.sdp,
+                        type: peerConnection.localDescription.type,
+                        webrtc_id: webrtc_id,
+                    })
+                });
+
+                const serverResponse = await response.json();
+                await peerConnection.setRemoteDescription(serverResponse);
+            } catch (err) {
+                console.error('Error setting up WebRTC:', err);
+            }
+        }
+
+        function handleMessage(event) {
+            const eventJson = JSON.parse(event.data);
+            if (eventJson.type === "send_input") {
+                fetch('/input_hook', {
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/json',
+                    },
+                    body: JSON.stringify({
+                        webrtc_id: webrtc_id,
+                        api_key: apiKeyInput.value,
+                        voice_name: voiceSelect.value
+                    })
+                });
+            }
+        }
+
+        function updateVisualization() {
+            if (!analyser) return;
+
+            analyser.getByteFrequencyData(dataArray);
+            const bars = document.querySelectorAll('.box');
+
+            for (let i = 0; i < bars.length; i++) {
+                const barHeight = (dataArray[i] / 255) * 2;
+                bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;
+            }
+
+            animationId = requestAnimationFrame(updateVisualization);
+        }
+
+        function stopWebRTC() {
+            if (peerConnection) {
+                peerConnection.close();
+            }
+            if (animationId) {
+                cancelAnimationFrame(animationId);
+            }
+            if (audioContext) {
+                audioContext.close();
+            }
+        }
+
+        startButton.addEventListener('click', () => {
+            if (!isRecording) {
+                setupWebRTC();
+                startButton.textContent = 'Stop Recording';
+                startButton.classList.add('recording');
+            } else {
+                stopWebRTC();
+                startButton.textContent = 'Start Recording';
+                startButton.classList.remove('recording');
+            }
+            isRecording = !isRecording;
+        });
+    </script>
+</body>
+
+</html>
requirements.txt ADDED
@@ -0,0 +1,4 @@
+fastrtc[vad]==0.0.32rc1
+python-dotenv
+google-genai
+twilio