Spaces:

BryanBradfo
/

speech-to-speech-streaming

Runtime error

App Files Files Community

BryanBradfo committed on Apr 2

Commit

1ea0a09

1 Parent(s): 1c7a116

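change of everything: rebuild app.py around a transformers text-generation pipeline (dropping the background transcription queue) and restyle index.html as HappyChat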

Browse files

Files changed (2) hide show

app.py +107 -215
index.html +166 -60

app.py CHANGED Viewed

@@ -1,10 +1,7 @@
 import json
 import os
 import time
-import asyncio
-import threading
 from pathlib import Path
-from queue import Queue
 import gradio as gr
 import numpy as np
@@ -23,229 +20,131 @@ from fastrtc.utils import audio_to_bytes
 from gradio.utils import get_space
 from groq import Groq
 from pydantic import BaseModel
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 load_dotenv()
-# Initialize all clients and models upfront to avoid cold start latency
-groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))
-# Load a small but effective model - initialize once and reuse
-print("Loading model...")
-model_name = "distilgpt2"  # Can be changed to "TinyLlama/TinyLlama-1.1B-Chat-v1.0" for better quality
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)
-print("Model loaded!")
 tts_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
-tts_model = get_tts_model()
 curr_dir = Path(__file__).parent
-# Pre-warm the model
-_ = model.generate(tokenizer.encode("Hello", return_tensors="pt"), max_length=5)
-print("Model warmed up")
-# Create a transcription queue for parallel processing
-transcription_queue = Queue()
-response_queue = Queue()
-# Global session state
-sessions = {}
-def transcribe_audio(audio_bytes):
-    """Transcribe audio using Groq's Whisper API"""
-    try:
-        return groq_client.audio.transcriptions.create(
-            file=("audio-file.mp3", audio_bytes),
-            model="whisper-large-v3-turbo",
-            response_format="verbose_json",
-        ).text
-    except Exception as e:
-        print(f"Transcription error: {e}")
-        return "I couldn't understand that. Could you try again?"
-def generate_response(prompt, session_id):
-    """Generate response using the model"""
-    try:
-        # Get or create session history
-        if session_id not in sessions:
-            sessions[session_id] = []
-        # Keep only last 3 exchanges for context
-        history = sessions[session_id][-3:] if sessions[session_id] else []
-        # Build context from history
-        context = "You are a helpful assistant. Keep responses brief and direct.\n"
-        for item in history:
-            role = item.get("role", "")
-            content = item.get("content", "")
-            if role and content:
-                prefix = "User: " if role == "user" else "Assistant: "
-                context += f"{prefix}{content}\n"
-        context += f"User: {prompt}\nAssistant:"
-        # Encode the prompt
-        inputs = tokenizer(context, return_tensors="pt")
-        # Generate
-        with torch.no_grad():
-            output = model.generate(
-                inputs.input_ids,
-                max_length=len(inputs.input_ids[0]) + 100,
-                temperature=0.7,
-                top_p=0.9,
-                repetition_penalty=1.2,
-                do_sample=True,
-                pad_token_id=tokenizer.eos_token_id
-            )
-        # Decode and clean
-        full_output = tokenizer.decode(output[0], skip_special_tokens=True)
-        # Extract just the assistant's response
-        response = full_output.split("Assistant:")[-1].strip()
-        # Clean up the response (remove repetition)
-        sentences = []
-        for sent in response.split('. '):
-            if sent and sent not in sentences:
-                sentences.append(sent)
-        cleaned_response = '. '.join(sentences)
-        if not cleaned_response.endswith('.'):
-            cleaned_response += '.'
-        return cleaned_response
-    except Exception as e:
-        print(f"Response generation error: {e}")
-        return "I'm processing that. One moment please."
-def background_transcription():
-    """Background worker for transcription"""
-    while True:
-        session_id, audio_data = transcription_queue.get()
-        if session_id is None:  # Poison pill
-            break
-        transcript = transcribe_audio(audio_data)
-        response_queue.put((session_id, "user", transcript))
-        # Add to session history
-        if session_id not in sessions:
-            sessions[session_id] = []
-        sessions[session_id].append({"role": "user", "content": transcript})
-        # Generate response in the same thread for simplicity
-        response_text = generate_response(transcript, session_id)
-        response_queue.put((session_id, "assistant", response_text))
-        # Add to session history
-        sessions[session_id].append({"role": "assistant", "content": response_text})
-        transcription_queue.task_done()
-# Start the background worker
-transcription_thread = threading.Thread(target=background_transcription, daemon=True)
-transcription_thread.start()
-def optimize_tts(text):
-    """Optimize text for better TTS performance"""
-    # Break into smaller chunks at natural boundaries
-    chunks = []
-    current_chunk = ""
-    for sentence in text.split('. '):
-        if not sentence.strip():
-            continue
-        if len(current_chunk) + len(sentence) > 100:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sentence
-        else:
-            if current_chunk:
-                current_chunk += ". " + sentence
-            else:
-                current_chunk = sentence
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    # Make sure chunks end with periods
-    for i in range(len(chunks)):
-        if not chunks[i].endswith('.'):
-            chunks[i] += '.'
-    return chunks
 def response(
     audio: tuple[int, np.ndarray],
     chatbot: list[dict] | None = None,
-    session_id=None
 ):
-    if session_id is None:
-        session_id = f"session_{time.time()}"
     chatbot = chatbot or []
-    audio_bytes = audio_to_bytes(audio)
-    # Queue the audio for transcription
-    transcription_queue.put((session_id, audio_bytes))
-    # Wait for transcription and response (with timeout for real-time experience)
-    waited = 0
-    transcript = None
-    response_text = None
-    # Get user transcript first
-    while waited < 5:  # 5 seconds max wait
-        if not response_queue.empty():
-            sid, role, text = response_queue.get()
-            if sid == session_id and role == "user":
-                transcript = text
-                break
-        time.sleep(0.1)
-        waited += 0.1
-    if transcript is None:
-        # If transcription is taking too long, provide immediate feedback
-        transcript = "Processing your message..."
-    # Update chatbot with user message
-    chatbot.append({"role": "user", "content": transcript})
-    yield AdditionalOutputs(chatbot)
-    # Get assistant response
-    waited = 0
-    while waited < 7:  # 7 seconds max wait
-        if not response_queue.empty():
-            sid, role, text = response_queue.get()
-            if sid == session_id and role == "assistant":
-                response_text = text
-                break
-        time.sleep(0.1)
-        waited += 0.1
-    if response_text is None:
-        # If response generation is taking too long, provide immediate feedback
-        response_text = "I'm thinking about that. One moment please."
-    # Update chatbot with assistant message
-    chatbot.append({"role": "assistant", "content": response_text})
-    yield AdditionalOutputs(chatbot)
-    # Stream TTS in smaller chunks for real-time experience
     try:
-        chunks = optimize_tts(response_text)
-        for chunk in chunks:
-            for audio_chunk in tts_model.stream_tts_sync(chunk):
-                yield audio_chunk
     except Exception as e:
-        print(f"TTS error: {e}")
 chatbot = gr.Chatbot(type="messages")
@@ -296,26 +195,19 @@ def _(webrtc_id: str):
         async for output in stream.output_stream(webrtc_id):
             chatbot = output.args[0]
             if chatbot and len(chatbot) > 0:
-                # Only send the most recent message to avoid duplicates
                 yield f"event: output\ndata: {json.dumps(chatbot[-1])}\n\n"
     return StreamingResponse(output_stream(), media_type="text/event-stream")
 if __name__ == "__main__":
-    try:
-        import torch
-        print("Running with PyTorch")
-    except ImportError:
-        print("PyTorch not available, might affect performance")
     import os
-    print("Starting server...")
     if (mode := os.getenv("MODE")) == "UI":
         stream.ui.launch(server_port=7860, server_name="0.0.0.0")
     elif mode == "PHONE":
         stream.fastphone(host="0.0.0.0", port=7860)
     else:
         import uvicorn
         uvicorn.run(app, host="0.0.0.0", port=7860)

 import json
 import os
 import time
 from pathlib import Path
 import gradio as gr
 import numpy as np
 from gradio.utils import get_space
 from groq import Groq
 from pydantic import BaseModel
+from transformers import pipeline
 load_dotenv()
+groq_client = Groq()
+# Using a better but still small free model - distilGPT2 is more coherent than OPT-125M
+text_generation = pipeline(
+    "text-generation",
+    model="distilgpt2",  # Better free model that's still small
+    device_map="auto",
+    truncation=True
+)
 tts_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
 curr_dir = Path(__file__).parent
+tts_model = get_tts_model()
+# Keep track of last response to prevent repetition
+last_response = ""
+def clean_response(text):
+    """Remove repeated sentences from generated text and cap its length.
+
+    The text is split on ``'. '``. Sentences are compared without their
+    trailing period, so a repeat of the final sentence (which keeps its
+    period after the split) is also detected. The first occurrence of each
+    sentence is kept, in order.
+
+    Args:
+        text: Raw text produced by the language model.
+
+    Returns:
+        The de-duplicated text. A closing period is added only when the text
+        does not already end in ``.``, ``!`` or ``?``. Results longer than
+        200 characters are cut to 197 characters plus ``"..."``. Returns an
+        empty string when the input contains no sentence at all.
+    """
+    seen = set()
+    kept = []
+    for raw_sentence in text.split('. '):
+        sentence = raw_sentence.strip()
+        # Compare without the trailing period: only the last piece of the
+        # split still carries one, and it must match earlier pieces.
+        key = sentence.rstrip('.')
+        if not key or key in seen:
+            continue
+        seen.add(key)
+        kept.append(sentence)
+    if not kept:
+        return ""
+    cleaned_text = '. '.join(kept)
+    # Do not turn "How are you?" into "How are you?."
+    if not cleaned_text.endswith(('.', '!', '?')):
+        cleaned_text += '.'
+    # Limit length to avoid very long responses
+    if len(cleaned_text) > 200:
+        cleaned_text = cleaned_text[:197] + "..."
+    return cleaned_text
 def response(
     audio: tuple[int, np.ndarray],
     chatbot: list[dict] | None = None,
 ):
+    """Transcribe one spoken turn, generate a short reply and stream it as speech.
+
+    This is a generator. It yields, in order:
+
+    1. ``AdditionalOutputs(chatbot)`` once the user transcript is appended,
+    2. ``AdditionalOutputs(chatbot)`` once the assistant reply is appended,
+    3. every chunk produced by ``tts_model.stream_tts_sync`` for the reply.
+
+    If any step raises, an apology message is appended to ``chatbot``,
+    yielded, and spoken on a best-effort basis instead.
+
+    Args:
+        audio: The recorded turn, passed unchanged to ``audio_to_bytes``
+            (presumably ``(sample_rate, samples)`` - confirm against caller).
+        chatbot: Message dicts with ``role`` and ``content`` keys; a new
+            list is used when it is ``None`` or empty.
+
+    NOTE(review): ``last_response`` is a module-level global, so the
+    repetition check is shared by every caller of this function.
+    """
+    global last_response
     chatbot = chatbot or []
     try:
+        prompt = groq_client.audio.transcriptions.create(
+            file=("audio-file.mp3", audio_to_bytes(audio)),
+            model="whisper-large-v3-turbo",
+            response_format="verbose_json",
+        ).text
+        chatbot.append({"role": "user", "content": prompt})
+        yield AdditionalOutputs(chatbot)
+        # Create a better prompt for the model
+        context = "You are a helpful assistant. Keep your responses short and to the point."
+        if len(chatbot) > 1:
+            # chatbot[-1] is the prompt appended above, so chatbot[-2] is
+            # the message that preceded it.
+            context += f"\nPrevious: {chatbot[-2]['content']}\nYou: {prompt}"
+        else:
+            context += f"\nUser: {prompt}"
+        # Generate response using the Hugging Face model
+        generated_text = text_generation(
+            context,
+            # max_length would count the prompt tokens too and leave little
+            # or no room for the reply once the context grows.
+            max_new_tokens=64,
+            # Ask for the continuation only, rather than stripping the
+            # prompt afterwards with str.replace.
+            return_full_text=False,
+            num_return_sequences=1,
+            do_sample=True,
+            top_p=0.92,
+            temperature=0.7,
+            repetition_penalty=1.2,  # Penalize repetition
+        )
+        # Clean up the response
+        response_text = clean_response(generated_text[0]['generated_text'].strip())
+        tell_more = "I see. Could you tell me more about that?"
+        elaborate = "I understand. Can you elaborate on that?"
+        # Add a fallback if the response is empty or too short
+        if len(response_text) < 10:
+            response_text = tell_more
+        # Prevent exact repetition of the previous spoken reply; pick the
+        # fallback that differs from what was said last time.
+        if response_text == last_response:
+            response_text = tell_more if last_response == elaborate else elaborate
+        # Remember what is actually sent, after all fallbacks were applied.
+        last_response = response_text
+        chatbot.append({"role": "assistant", "content": response_text})
+        yield AdditionalOutputs(chatbot)  # Send chatbot update first
+        start = time.time()
+        print("starting tts", start)
+        # Process each sentence separately for TTS
+        for i, sentence in enumerate(response_text.split('. ')):
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            # The last piece already ends in punctuation; avoid "..", "?."
+            if not sentence.endswith(('.', '!', '?')):
+                sentence += "."
+            for chunk in tts_model.stream_tts_sync(sentence):
+                print(f"chunk {i}.{chunk}", time.time() - start)
+                yield chunk
+        print("finished tts", time.time() - start)
     except Exception as e:
+        print(f"Error in response generation: {e}")
+        error_message = "Sorry, I encountered an error processing your request."
+        chatbot.append({"role": "assistant", "content": error_message})
+        yield AdditionalOutputs(chatbot)
+        # Still try to produce audio for the error message
+        try:
+            for chunk in tts_model.stream_tts_sync(error_message):
+                yield chunk
+        except Exception as tts_error:
+            # Best effort only: the error is already in the chatbot, so log
+            # the TTS failure instead of hiding it.
+            print(f"Error speaking the error message: {tts_error}")
 chatbot = gr.Chatbot(type="messages")
         async for output in stream.output_stream(webrtc_id):
             chatbot = output.args[0]
             if chatbot and len(chatbot) > 0:
                 yield f"event: output\ndata: {json.dumps(chatbot[-1])}\n\n"
     return StreamingResponse(output_stream(), media_type="text/event-stream")
 if __name__ == "__main__":
     import os
     if (mode := os.getenv("MODE")) == "UI":
         stream.ui.launch(server_port=7860, server_name="0.0.0.0")
     elif mode == "PHONE":
         stream.fastphone(host="0.0.0.0", port=7860)
     else:
         import uvicorn
         uvicorn.run(app, host="0.0.0.0", port=7860)

index.html CHANGED Viewed

@@ -4,109 +4,168 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>RetroChat Audio</title>
     <style>
         body {
-            font-family: monospace;
-            background-color: #1a1a1a;
-            color: #00ff00;
             margin: 0;
             padding: 20px;
             height: 100vh;
             box-sizing: border-box;
         }
         .container {
             display: flex;
             flex-direction: column;
             gap: 20px;
             height: calc(100% - 100px);
             margin-bottom: 20px;
         }
         .chat-container {
-            border: 2px solid #00ff00;
             padding: 20px;
             display: flex;
             flex-direction: column;
             flex-grow: 1;
             box-sizing: border-box;
         }
         .controls-container {
-            border: 2px solid #00ff00;
             padding: 20px;
             display: flex;
             align-items: center;
             gap: 20px;
             height: 128px;
             box-sizing: border-box;
         }
         .visualization-container {
             flex-grow: 1;
             display: flex;
             align-items: center;
         }
         .box-container {
             display: flex;
             justify-content: space-between;
             height: 64px;
             width: 100%;
         }
         .box {
             height: 100%;
             width: 8px;
-            background: #00ff00;
-            border-radius: 8px;
             transition: transform 0.05s ease;
         }
         .chat-messages {
             flex-grow: 1;
             overflow-y: auto;
             margin-bottom: 20px;
-            padding: 10px;
-            border: 1px solid #00ff00;
         }
         .message {
             margin-bottom: 10px;
-            padding: 8px;
-            border-radius: 4px;
         }
         .message.user {
-            background-color: #003300;
         }
         .message.assistant {
-            background-color: #002200;
         }
         button {
             height: 64px;
             min-width: 120px;
-            background-color: #000;
-            color: #00ff00;
-            border: 2px solid #00ff00;
             padding: 10px 20px;
-            font-family: monospace;
-            font-size: 16px;
             cursor: pointer;
             transition: all 0.3s;
         }
         button:hover {
-            border-width: 3px;
         }
         #audio-output {
             display: none;
         }
-        /* Retro CRT effect */
-        .crt-overlay {
             position: absolute;
-            top: 0;
-            left: 0;
-            width: 100%;
-            height: 100%;
-            background: repeating-linear-gradient(0deg,
-                    rgba(0, 255, 0, 0.03),
-                    rgba(0, 255, 0, 0.03) 1px,
-                    transparent 1px,
-                    transparent 2px);
             pointer-events: none;
         }
         /* Add these new styles */
         .icon-with-spinner {
             display: flex;
@@ -115,20 +174,23 @@
             gap: 12px;
             min-width: 180px;
         }
         .spinner {
             width: 20px;
             height: 20px;
-            border: 2px solid #00ff00;
             border-top-color: transparent;
             border-radius: 50%;
             animation: spin 1s linear infinite;
             flex-shrink: 0;
         }
         @keyframes spin {
             to {
                 transform: rotate(360deg);
             }
         }
         .pulse-container {
             display: flex;
             align-items: center;
@@ -136,53 +198,66 @@
             gap: 12px;
             min-width: 180px;
         }
         .pulse-circle {
             width: 20px;
             height: 20px;
             border-radius: 50%;
-            background-color: #00ff00;
-            opacity: 0.2;
             flex-shrink: 0;
             transform: translateX(-0%) scale(var(--audio-level, 1));
             transition: transform 0.1s ease;
         }
         /* Add styles for typing indicator */
         .typing-indicator {
-            padding: 8px;
-            background-color: #002200;
-            border-radius: 4px;
             margin-bottom: 10px;
             display: none;
         }
         .dots {
             display: inline-flex;
             gap: 4px;
         }
         .dot {
-            width: 8px;
-            height: 8px;
-            background-color: #00ff00;
             border-radius: 50%;
-            animation: pulse 1.5s infinite;
-            opacity: 0.5;
         }
         .dot:nth-child(2) {
-            animation-delay: 0.5s;
         }
         .dot:nth-child(3) {
-            animation-delay: 1s;
         }
-        @keyframes pulse {
-            0%,
-            100% {
-                opacity: 0.5;
-                transform: scale(1);
             }
             50% {
-                opacity: 1;
-                transform: scale(1.2);
             }
         }
         /* Add styles for toast notifications */
         .toast {
             position: fixed;
@@ -190,27 +265,58 @@
             left: 50%;
             transform: translateX(-50%);
             padding: 16px 24px;
-            border-radius: 4px;
-            font-size: 14px;
             z-index: 1000;
             display: none;
-            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
         }
         .toast.error {
-            background-color: #f44336;
             color: white;
         }
         .toast.warning {
-            background-color: #ffd700;
-            color: black;
         }
     </style>
 </head>
 <body>
     <!-- Add toast element after body opening tag -->
     <div id="error-toast" class="toast"></div>
     <div class="container">
         <div class="chat-container">
             <div class="chat-messages" id="chat-messages"></div>
             <!-- Move typing indicator outside the chat messages -->
@@ -228,7 +334,7 @@
                     <!-- Boxes will be dynamically added here -->
                 </div>
             </div>
-            <button id="start-button">Press to talk with me</button>
         </div>
     </div>
     <audio id="audio-output"></audio>
@@ -260,7 +366,7 @@
                     </div>
                 `;
             } else {
-                startButton.innerHTML = 'Start';
             }
         }
         function showError(message) {
@@ -466,7 +572,7 @@
             updateButtonState();
         }
         startButton.addEventListener('click', () => {
-            if (startButton.textContent === 'Start') {
                 setupWebRTC();
             } else {
                 stop();

 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>HappyChat Audio</title>
     <style>
+        @import url('https://fonts.googleapis.com/css2?family=Comic+Neue:wght@400;700&display=swap');
         body {
+            font-family: 'Comic Neue', cursive;
+            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
+            color: #333;
             margin: 0;
             padding: 20px;
             height: 100vh;
             box-sizing: border-box;
         }
         .container {
             display: flex;
             flex-direction: column;
             gap: 20px;
             height: calc(100% - 100px);
             margin-bottom: 20px;
+            max-width: 1200px;
+            margin: 0 auto;
         }
         .chat-container {
+            border: 3px solid #ff6b6b;
+            border-radius: 16px;
             padding: 20px;
             display: flex;
             flex-direction: column;
             flex-grow: 1;
             box-sizing: border-box;
+            background-color: #fff;
+            box-shadow: 0 8px 20px rgba(0, 0, 0, 0.1);
         }
         .controls-container {
+            border: 3px solid #4ecdc4;
+            border-radius: 16px;
             padding: 20px;
             display: flex;
             align-items: center;
             gap: 20px;
             height: 128px;
             box-sizing: border-box;
+            background-color: #fff;
+            box-shadow: 0 8px 20px rgba(0, 0, 0, 0.1);
         }
         .visualization-container {
             flex-grow: 1;
             display: flex;
             align-items: center;
         }
         .box-container {
             display: flex;
             justify-content: space-between;
             height: 64px;
             width: 100%;
         }
         .box {
             height: 100%;
             width: 8px;
+            background: #ff6b6b;
+            border-radius: 20px;
             transition: transform 0.05s ease;
         }
+        .box:nth-child(odd) {
+            background: #ffcc5c;
+        }
+        .box:nth-child(3n) {
+            background: #4ecdc4;
+        }
+        .box:nth-child(5n) {
+            background: #c16ecf;
+        }
         .chat-messages {
             flex-grow: 1;
             overflow-y: auto;
             margin-bottom: 20px;
+            padding: 15px;
+            border: 2px solid #ffcc5c;
+            border-radius: 12px;
+            background-color: #f9f9f9;
         }
         .message {
             margin-bottom: 10px;
+            padding: 12px;
+            border-radius: 18px;
+            font-size: 16px;
+            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
+            max-width: 80%;
         }
         .message.user {
+            background-color: #ffcc5c;
+            margin-left: auto;
+            border-bottom-right-radius: 4px;
         }
         .message.assistant {
+            background-color: #4ecdc4;
+            color: white;
+            margin-right: auto;
+            border-bottom-left-radius: 4px;
         }
         button {
             height: 64px;
             min-width: 120px;
+            background-color: #ff6b6b;
+            color: white;
+            border: none;
+            border-radius: 32px;
             padding: 10px 20px;
+            font-family: 'Comic Neue', cursive;
+            font-size: 18px;
+            font-weight: bold;
             cursor: pointer;
             transition: all 0.3s;
+            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
         }
         button:hover {
+            transform: translateY(-3px);
+            box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
+            background-color: #ff5252;
+        }
+        button:active {
+            transform: translateY(1px);
         }
         #audio-output {
             display: none;
         }
+        /* Floating bubbles animation */
+        .bubble {
             position: absolute;
+            border-radius: 50%;
+            opacity: 0.6;
             pointer-events: none;
+            animation: float 15s infinite ease-in-out;
+        }
+        @keyframes float {
+            0%, 100% {
+                transform: translateY(0) rotate(0deg);
+            }
+            50% {
+                transform: translateY(-80px) rotate(180deg);
+            }
         }
         /* Add these new styles */
         .icon-with-spinner {
             display: flex;
             gap: 12px;
             min-width: 180px;
         }
         .spinner {
             width: 20px;
             height: 20px;
+            border: 2px solid #fff;
             border-top-color: transparent;
             border-radius: 50%;
             animation: spin 1s linear infinite;
             flex-shrink: 0;
         }
         @keyframes spin {
             to {
                 transform: rotate(360deg);
             }
         }
         .pulse-container {
             display: flex;
             align-items: center;
             gap: 12px;
             min-width: 180px;
         }
         .pulse-circle {
             width: 20px;
             height: 20px;
             border-radius: 50%;
+            background-color: #ffcc5c;
+            opacity: 0.8;
             flex-shrink: 0;
             transform: translateX(-0%) scale(var(--audio-level, 1));
             transition: transform 0.1s ease;
+            box-shadow: 0 0 10px #ffcc5c;
         }
         /* Add styles for typing indicator */
         .typing-indicator {
+            padding: 12px;
+            background-color: #f0f0f0;
+            border-radius: 18px;
+            border-bottom-left-radius: 4px;
             margin-bottom: 10px;
             display: none;
+            width: fit-content;
         }
         .dots {
             display: inline-flex;
             gap: 4px;
         }
         .dot {
+            width: 10px;
+            height: 10px;
+            background-color: #ff6b6b;
             border-radius: 50%;
+            animation: bounce 1.5s infinite;
+        }
+        .dot:nth-child(1) {
+            background-color: #ff6b6b;
         }
         .dot:nth-child(2) {
+            animation-delay: 0.2s;
+            background-color: #ffcc5c;
         }
         .dot:nth-child(3) {
+            animation-delay: 0.4s;
+            background-color: #4ecdc4;
         }
+        @keyframes bounce {
+            0%, 100% {
+                transform: translateY(0);
             }
             50% {
+                transform: translateY(-10px);
             }
         }
         /* Add styles for toast notifications */
         .toast {
             position: fixed;
             left: 50%;
             transform: translateX(-50%);
             padding: 16px 24px;
+            border-radius: 12px;
+            font-size: 16px;
             z-index: 1000;
             display: none;
+            box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2);
         }
         .toast.error {
+            background-color: #ff6b6b;
             color: white;
         }
         .toast.warning {
+            background-color: #ffcc5c;
+            color: #333;
+        }
+        .header {
+            text-align: center;
+            margin-bottom: 20px;
+        }
+        .header h1 {
+            color: #ff6b6b;
+            font-size: 36px;
+            margin-bottom: 8px;
+            text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.1);
+        }
+        .header p {
+            color: #666;
+            font-size: 18px;
         }
     </style>
 </head>
 <body>
+    <!-- Add animated bubbles -->
+    <div class="bubble" style="width: 60px; height: 60px; background: #ff6b6b; left: 10%; top: 20%;"></div>
+    <div class="bubble" style="width: 40px; height: 40px; background: #ffcc5c; right: 15%; top: 10%;"></div>
+    <div class="bubble" style="width: 80px; height: 80px; background: #4ecdc4; left: 20%; bottom: 10%;"></div>
+    <div class="bubble" style="width: 50px; height: 50px; background: #c16ecf; right: 20%; bottom: 20%;"></div>
     <!-- Add toast element after body opening tag -->
     <div id="error-toast" class="toast"></div>
     <div class="container">
+        <div class="header">
+            <h1>HappyChat</h1>
+            <p>Speak and listen to your friendly AI assistant!</p>
+        </div>
         <div class="chat-container">
             <div class="chat-messages" id="chat-messages"></div>
             <!-- Move typing indicator outside the chat messages -->
                     <!-- Boxes will be dynamically added here -->
                 </div>
             </div>
+            <button id="start-button">Start Chatting!</button>
         </div>
     </div>
     <audio id="audio-output"></audio>
                     </div>
                 `;
             } else {
+                startButton.innerHTML = 'Start Chatting!';
             }
         }
         function showError(message) {
             updateButtonState();
         }
         startButton.addEventListener('click', () => {
+            if (startButton.textContent === 'Start Chatting!') {
                 setupWebRTC();
             } else {
                 stop();