Nirav Madhani committed
Commit a1096d7 · Parent(s): 9b731f8

Front end function call

Files changed (4):
  1. .gitignore  +2 -0
  2. handler.py  +25 -12
  3. index.html  +104 -57
  4. webapp.py   +19 -12
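In short: instead of handler.py auto-acknowledging every Gemini tool call with a stubbed "ok", tool calls are now forwarded to the browser as a "function_call" message, handled there, and returned as a "tool_call_response", which handler.py relays to Gemini. All traffic on the /ws socket uses a {type, payload} JSON envelope. A minimal sketch of the three envelopes follows; the ids and result strings are illustrative, not part of the commit:

import base64
import json

# Server -> client: base64-encoded 24 kHz mono int16 PCM
audio_msg = {"type": "audio", "payload": base64.b64encode(b"\x00\x00").decode()}

# Server -> client: Gemini tool call forwarded verbatim
function_call_msg = {
    "type": "function_call",
    "payload": {"functionCalls": [{"id": "call-1", "name": "turn_on_the_lights"}]},
}

# Client -> server: the browser's answer, relayed to Gemini by handle_tool_call()
tool_call_response_msg = {
    "type": "tool_call_response",
    "payload": {
        "id": "call-1",
        "name": "turn_on_the_lights",
        "response": {"result": {"string_value": "Lights turned on successfully"}},
    },
}

for m in (audio_msg, function_call_msg, tool_call_response_msg):
    print(json.dumps(m))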
.gitignore ADDED
@@ -0,0 +1,2 @@
+.env
+__pycache__/*
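The newly ignored .env pairs with the load_dotenv() call added to handler.py below. A sketch of its expected contents, with a placeholder value; GOOGLE_API_KEY is the only variable the code reads:

# .env (read by python-dotenv at startup; never committed)
GOOGLE_API_KEY=your-api-key-here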
handler.py CHANGED
@@ -6,11 +6,18 @@ import os
 import traceback
 from websockets.asyncio.client import connect
 
+from dotenv import load_dotenv
+
+# Load environment variables from a .env file
+load_dotenv()
+
 host = "generativelanguage.googleapis.com"
 model = "gemini-2.0-flash-exp"
 api_key = os.environ["GOOGLE_API_KEY"]
 uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"
 
+
+
 class AudioLoop:
     def __init__(self):
         self.ws = None
@@ -54,33 +61,39 @@ class AudioLoop:
                 response["serverContent"]["modelTurn"]["parts"][0]["inlineData"]["data"]
             )
             pcm_data = base64.b64decode(b64data)
-            await self.audio_in_queue.put(pcm_data)
+            # Send audio with type "audio"
+            await self.audio_in_queue.put({
+                "type": "audio",
+                "payload": base64.b64encode(pcm_data).decode()
+            })
         except KeyError:
             # No audio in this message
             pass
 
+        # Forward function calls to client
         tool_call = response.pop('toolCall', None)
         if tool_call is not None:
-            await self.handle_tool_call(tool_call)
+            await self.audio_in_queue.put({
+                "type": "function_call",
+                "payload": tool_call
+            })
 
         # If "turnComplete" is present
         if "serverContent" in response and response["serverContent"].get("turnComplete"):
             print("[AudioLoop] Gemini turn complete")
 
-    async def handle_tool_call(self, tool_call):
-        print(" ", tool_call)
-        for fc in tool_call['functionCalls']:
-            msg = {
-                'tool_response': {
-                    'function_responses': [{
-                        'id': fc['id'],
-                        'name': fc['name'],
-                        'response': {'result': {'string_value': 'ok'}}
-                    }]
-                }
-            }
-            print('>>> ', msg)
-            await self.ws.send(json.dumps(msg))
+    async def handle_tool_call(self, tool_call_response):
+        """Handle tool call response from client"""
+        msg = {
+            'tool_response': {
+                'function_responses': [{
+                    'id': tool_call_response['id'],
+                    'name': tool_call_response['name'],
+                    'response': tool_call_response['response']
+                }]
+            }
+        }
+        await self.ws.send(json.dumps(msg))
 
     async def run(self):
         """Main entry point: connects to Gemini, starts send/receive tasks."""
index.html CHANGED
@@ -144,6 +144,7 @@
     <label><input type="checkbox" id="logWebSocket"> WebSocket Events</label>
     <label style="margin-left: 1em"><input type="checkbox" id="logAudio"> Audio Events</label>
     <label style="margin-left: 1em"><input type="checkbox" id="logText"> Text Events</label>
+    <label style="margin-left: 1em"><input type="checkbox" id="logFunction" checked> Function Events</label>
     <label style="margin-left: 1em"><input type="checkbox" id="logError" checked> Error Events</label>
   </div>
 
@@ -211,14 +212,21 @@
     }
 
     function logMessage(category, ...args) {
-      const pre = document.getElementById("log");
-      const logCategory = document.getElementById(`log${category.charAt(0).toUpperCase() + category.slice(1)}`);
-      const shouldLog = logCategory ? logCategory.checked : false;
-
-      if (shouldLog) {
+      const logElement = document.getElementById('log');
+      const shouldLog = {
+        'websocket': document.getElementById('logWebSocket').checked,
+        'audio': document.getElementById('logAudio').checked,
+        'text': document.getElementById('logText').checked,
+        'function': document.getElementById('logFunction').checked,
+        'error': document.getElementById('logError').checked
+      };
+
+      if (shouldLog[category]) {
         const timestamp = new Date().toLocaleTimeString();
-        pre.textContent += `[${timestamp}] [${category}] ` + args.join(" ") + "\n";
-        console.log(`[${category}]`, ...args);
+        const message = `[${timestamp}] [${category}] ${args.map(arg =>
+          typeof arg === 'object' ? JSON.stringify(arg, null, 2) : arg
+        ).join(' ')}`;
+        logElement.textContent = message + '\n' + logElement.textContent;
       }
     }
 
@@ -311,54 +319,49 @@
     }
 
     function connectWebSocket() {
-      logMessage("WebSocket", "Connecting...");
-      updateConnectionStatus(false);
-
-      // Use current origin and replace http(s) with ws(s)
-      const wsUrl = `${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/ws`;
-      socket = new WebSocket(wsUrl);
-
-      socket.onopen = () => {
-        logMessage("WebSocket", "Opened connection");
-        updateConnectionStatus(true);
-        if (!playbackCtx) {
-          playbackCtx = new (window.AudioContext || window.webkitAudioContext)();
-          setupVisualizer();
-        }
-        nextPlaybackTime = playbackCtx.currentTime;
-      };
-
-      socket.onerror = (err) => {
-        logMessage("Error", "WebSocket error:", err);
-        updateConnectionStatus(false);
-      };
-
-      socket.onclose = () => {
-        logMessage("WebSocket", "Connection closed");
-        updateConnectionStatus(false);
-        if (isCapturing) {
-          stopCapture();
-        }
-      };
-
-      socket.onmessage = (event) => {
-        try {
-          const data = JSON.parse(event.data);
-          if (data.type === "audio" && data.payload) {
-            const arrayBuffer = base64ToArrayBuffer(data.payload);
+      try {
+        socket = new WebSocket(`ws://${window.location.host}/ws`);
+
+        socket.onopen = () => {
+          logMessage('websocket', 'Connected to server');
+          updateConnectionStatus(true);
+        };
+
+        socket.onclose = () => {
+          logMessage('websocket', 'Disconnected from server');
+          updateConnectionStatus(false);
+        };
+
+        socket.onmessage = async (event) => {
+          const message = JSON.parse(event.data);
+          const messageType = message.type;
+
+          if (messageType === 'audio') {
+            // Handle audio data
+            logMessage('audio', 'Received audio chunk from server');
+            const arrayBuffer = base64ToArrayBuffer(message.payload);
+
+            if (!playbackCtx) {
+              playbackCtx = new (window.AudioContext || window.webkitAudioContext)();
+              setupVisualizer();
+            }
+
+            // Convert Int16 PCM to Float32
             const int16View = new Int16Array(arrayBuffer);
             const float32Buffer = new Float32Array(int16View.length);
             for (let i = 0; i < int16View.length; i++) {
               float32Buffer[i] = int16View[i] / 32768;
             }
-            const sampleRate = 24000; // RECEIVED_SAMPLE_RATE from app.py
+
+            // Create audio buffer with correct sample rate
+            const sampleRate = 24000; // Sample rate from server
             const audioBuffer = playbackCtx.createBuffer(1, float32Buffer.length, sampleRate);
             audioBuffer.copyToChannel(float32Buffer, 0);
-            let scheduledTime = playbackCtx.currentTime > nextPlaybackTime ? playbackCtx.currentTime : nextPlaybackTime;
+
             const source = playbackCtx.createBufferSource();
             source.buffer = audioBuffer;
 
-            // Connect through analyser for visualization
+            // Connect through analyser for visualization if available
             if (analyser) {
               source.connect(analyser);
               analyser.connect(playbackCtx.destination);
@@ -368,32 +371,76 @@
             } else {
               source.connect(playbackCtx.destination);
             }
-
-            source.start(scheduledTime);
-            // Add source to tracked sources
+
+            // Schedule the audio to play at the right time
+            const startTime = Math.max(nextPlaybackTime, playbackCtx.currentTime);
+            source.start(startTime);
+            nextPlaybackTime = startTime + audioBuffer.duration;
+
+            // Keep track of scheduled sources
             scheduledSources.push(source);
-            // Remove source from tracking once it finishes
+
+            // Clean up source when it finishes playing
            source.onended = () => {
              const index = scheduledSources.indexOf(source);
              if (index > -1) {
                scheduledSources.splice(index, 1);
              }
-              // Stop visualizer if no more audio
+              // Stop visualizer if no more audio playing
              if (scheduledSources.length === 0) {
                stopVisualizer();
              }
            };
-            nextPlaybackTime = scheduledTime + audioBuffer.duration;
-            logMessage("Audio", "Scheduled playback. Start time:", scheduledTime, "Duration:", audioBuffer.duration);
-          } else if (data.type === "text" && data.content) {
-            logMessage("Text", "Received:", data.content);
-          } else {
-            logMessage("WebSocket", "Received message:", event.data);
+          }
+          else if (messageType === 'function_call') {
+            // Handle function calls from server
+            logMessage('function', 'Received function call:', message.payload);
+            const functionCalls = message.payload.functionCalls;
+
+            for (const fc of functionCalls) {
+              const functionName = fc.name;
+              const functionId = fc.id;
+
+              // Handle different functions
+              let result = 'ok';
+
+              if (functionName === 'turn_on_the_lights') {
+                logMessage('function', 'Turning on the lights');
+                // Simulate turning on lights
+                result = 'Lights turned on successfully';
+              }
+              else if (functionName === 'turn_off_the_lights') {
+                logMessage('function', 'Turning off the lights');
+                // Simulate turning off lights
+                result = 'Lights turned off successfully';
+              }
+
+              // Send response back to server
+              const response = {
+                type: 'tool_call_response',
+                payload: {
+                  id: functionId,
+                  name: functionName,
+                  response: {
+                    result: {
+                      string_value: result
+                    }
+                  }
+                }
+              };
+
+              socket.send(JSON.stringify(response));
+              logMessage('function', 'Sent function response:', response);
+            }
           }
-        } catch (err) {
-          logMessage("Error", "Failed to process message:", err);
-        }
-      };
+        };
+
+        socket.onerror = (error) => {
+          logMessage('error', 'WebSocket error:', error);
+        };
+      } catch (error) {
+        logMessage('error', 'Failed to connect:', error);
+      }
     }
 
     async function startCapture() {
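The page answers tool calls with a simulated if/else over two light-control functions. For exercising the same protocol without a browser, here is the equivalent dispatch in Python; the function names and result strings come from the diff, while the call id is illustrative:

import json

# Stubbed handlers mirroring the page's if/else chain
HANDLERS = {
    "turn_on_the_lights": lambda: "Lights turned on successfully",
    "turn_off_the_lights": lambda: "Lights turned off successfully",
}

def answer_function_calls(payload):
    """Build one tool_call_response envelope per function call, as the page does."""
    out = []
    for fc in payload["functionCalls"]:
        result = HANDLERS.get(fc["name"], lambda: "ok")()  # 'ok' is the page's default
        out.append(json.dumps({
            "type": "tool_call_response",
            "payload": {
                "id": fc["id"],
                "name": fc["name"],
                "response": {"result": {"string_value": result}},
            },
        }))
    return out

print(answer_function_calls(
    {"functionCalls": [{"id": "call-1", "name": "turn_on_the_lights"}]}
)[0])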
webapp.py CHANGED
@@ -53,6 +53,7 @@ async def websocket_endpoint(websocket: WebSocket):
 
                 # Handle audio data from client
                 if msg_type == "audio":
+                    # Decode base64 audio from client
                     raw_pcm = base64.b64decode(msg["payload"])
                     forward_msg = {
                         "realtime_input": {
@@ -97,29 +98,35 @@ async def websocket_endpoint(websocket: WebSocket):
                         }
                     }
                     await audio_loop.out_queue.put(forward_msg)
 
+                elif msg_type == "tool_call_response":
+                    # Handle tool call response from client
+                    await audio_loop.handle_tool_call(msg["payload"])
+
                 else:
                     print("[from_client_to_gemini] Unknown message type:", msg_type)
 
         except WebSocketDisconnect:
             print("[from_client_to_gemini] Client disconnected.")
-            #del audio_loop
             loop_task.cancel()
         except Exception as e:
             print("[from_client_to_gemini] Error:", e)
 
     async def from_gemini_to_client():
-        """Reads PCM audio from Gemini and sends it back to the client."""
+        """Reads messages from Gemini and sends them back to the client."""
         try:
             while True:
-                pcm_data = await audio_loop.audio_in_queue.get()
-                b64_pcm = base64.b64encode(pcm_data).decode()
-
-                out_msg = {
-                    "type": "audio",
-                    "payload": b64_pcm
-                }
-                print("[from_gemini_to_client] Sending audio chunk to client. Size:", len(pcm_data))
-                await websocket.send_text(json.dumps(out_msg))
+                message = await audio_loop.audio_in_queue.get()
+                message_type = message["type"]
+
+                if message_type == "audio":
+                    # Audio data is already base64 encoded from handler.py
+                    await websocket.send_text(json.dumps(message))
+                    print("[from_gemini_to_client] Sending audio chunk to client")
+
+                elif message_type == "function_call":
+                    # Forward function call to client
+                    await websocket.send_text(json.dumps(message))
+                    print("[from_gemini_to_client] Forwarding function call to client")
 
         except WebSocketDisconnect:
             print("[from_gemini_to_client] Client disconnected.")
@@ -143,5 +150,5 @@ async def websocket_endpoint(websocket: WebSocket):
         pass
     print("[websocket_endpoint] Cleaned up AudioLoop for client")
 
-if __name__ == "__main__":
+if __name__ == "__main__":
     uvicorn.run("webapp:app", host="0.0.0.0", port=7860, reload=True)
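A minimal headless client for smoke-testing the whole loop, as a sketch rather than part of the commit: it assumes webapp.py is running locally on port 7860 as configured above, and it answers every forwarded tool call with a stub result, much like the page does. It uses the same websockets client API that handler.py already imports.

import asyncio
import json

from websockets.asyncio.client import connect  # same client API handler.py uses

async def main():
    # Assumes the FastAPI app is serving /ws on localhost:7860 (per the diff)
    async with connect("ws://localhost:7860/ws") as ws:
        async for raw in ws:
            msg = json.loads(raw)
            if msg["type"] == "function_call":
                # Answer every call with a stub result, like the page does
                for fc in msg["payload"]["functionCalls"]:
                    await ws.send(json.dumps({
                        "type": "tool_call_response",
                        "payload": {
                            "id": fc["id"],
                            "name": fc["name"],
                            "response": {"result": {"string_value": "ok"}},
                        },
                    }))
            elif msg["type"] == "audio":
                print("audio chunk:", len(msg["payload"]), "base64 chars")

asyncio.run(main())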