import gradio as gr
import cv2
import numpy as np
import os
import requests
import base64
import io
import soundfile as sf

# Backend server URL
backend_server_url = "https://0416-2600-1017-a410-36b8-2357-52be-1318-959b.ngrok-free.app"
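# A minimal sketch (assumption, not part of the original configuration): let a
# hypothetical BACKEND_URL environment variable override the hard-coded ngrok
# address, which otherwise goes stale whenever the tunnel restarts.
backend_server_url = os.environ.get("BACKEND_URL", backend_server_url)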

# Backend interaction: upload the current frame (plus a placeholder audio file)
# to the /process/ endpoint and return the parsed JSON response.
def send_to_backend(frame):
    try:
        # Downscale the frame to keep the upload small, then save it to disk
        small_frame = cv2.resize(frame, (224, 224))
        cv2.imwrite("frame.jpg", small_frame)

        # Ensure a dummy audio file exists so the request always includes an audio part
        empty_audio_path = "input.mp3"
        if not os.path.exists(empty_audio_path):
            with open(empty_audio_path, "wb") as f:
                f.write(b"")

        with open("frame.jpg", "rb") as img, open("input.mp3", "rb") as audio:
            files = {
                "image": ("frame.jpg", img, "image/jpeg"),
                "audio": ("input.mp3", audio, "audio/mpeg")
            }
            # Time out instead of hanging the UI if the backend stalls
            response = requests.post(backend_server_url + "/process/", files=files, timeout=30)
            if response.status_code == 200:
                return response.json()
            else:
                return {"error": f"Backend error {response.status_code}: {response.text}"}
    except Exception as e:
        return {"error": f"Exception: {str(e)}"}

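# A small debugging sketch (hypothetical helper, not part of the original app):
# send a synthetic black frame to the backend and print the JSON reply, which
# the frontend expects to contain "caption" and "audio_base64" keys.
def _debug_send_black_frame():
    dummy_frame = np.zeros((480, 640, 3), dtype=np.uint8)  # black BGR frame
    print(send_to_backend(dummy_frame))
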
# Gradio processing function: convert the RGB image from the browser to BGR
# for OpenCV, send it to the backend, and return the caption and audio.
def process_webcam(image):
    if image is None:
        return None, None

    frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    result = send_to_backend(frame)

    caption = result.get("caption", "No caption")
    audio_base64 = result.get("audio_base64", None)

    if audio_base64:
        # Decode the base64-encoded audio from the backend into a
        # (sample_rate, samples) tuple that gr.Audio can play
        audio_bytes = base64.b64decode(audio_base64)
        audio_array, sample_rate = sf.read(io.BytesIO(audio_bytes))
        return caption, (sample_rate, audio_array)

    return caption, None

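# Hypothetical local check (assumes a test image exists at "test.jpg"): read it
# with OpenCV, convert BGR -> RGB to mimic what Gradio passes in, and run the
# full pipeline once without the web UI.
def _debug_process_local_image(path="test.jpg"):
    bgr = cv2.imread(path)
    if bgr is None:
        return f"Could not read {path}", None
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    return process_webcam(rgb)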

# Gradio interface
demo = gr.Interface(
    fn=process_webcam,
    inputs=gr.Image(sources=["upload", "webcam"]),    
    outputs=[
        gr.Textbox(label="Caption"),
        gr.Audio(label="Audio Output")
    ],
    live=True,
    title="SpokenVision",
    description="Real-time object detection and captioning with audio feedback",
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()
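
# Optional launch settings (a sketch using standard Gradio launch() arguments;
# these lines are commented out and not part of the original configuration):
# demo.launch(share=True)                                # temporary public link
# demo.launch(server_name="0.0.0.0", server_port=7860)   # listen on all interfaces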