import gradio as gr
import cv2
import numpy as np
import os
import requests
import base64
import io
import soundfile as sf
# Backend server URL
backend_server_url = "https://0416-2600-1017-a410-36b8-2357-52be-1318-959b.ngrok-free.app"
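# NOTE: this is a temporary ngrok free-tier tunnel; the URL changes each time
# the tunnel is restarted, so it must be updated to point at the current backend.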

# Backend interaction: send one frame (plus a placeholder audio file) to the
# processing endpoint and return its JSON response.
def send_to_backend(frame):
    try:
        # Alternative (unused): encode the frame in memory instead of writing to disk.
        # _, img_encoded = cv2.imencode('.jpg', frame)
        # img_bytes = img_encoded.tobytes()

        # Downscale the frame to reduce upload size.
        small_frame = cv2.resize(frame, (224, 224))
        # Save the current frame to disk so it can be posted as a file.
        cv2.imwrite("frame.jpg", small_frame)
        # Ensure a dummy audio file exists; the endpoint expects an audio part.
        empty_audio_path = "input.mp3"
        if not os.path.exists(empty_audio_path):
            with open(empty_audio_path, "wb") as f:
                f.write(b"")
        with open("frame.jpg", "rb") as img, open("input.mp3", "rb") as audio:
            files = {
                "image": ("frame.jpg", img, "image/jpeg"),
                "audio": ("input.mp3", audio, "audio/mpeg"),
            }
            response = requests.post(backend_server_url + "/process/", files=files)
        if response.status_code == 200:
            return response.json()
        else:
            return {"error": f"Backend error {response.status_code}: {response.text}"}
    except Exception as e:
        return {"error": f"Exception: {str(e)}"}

# Gradio processing function: convert the incoming image, query the backend,
# and unpack the caption and (optional) base64-encoded audio reply.
def process_webcam(image):
    if image is None:
        return None, None
    # Gradio delivers RGB arrays; OpenCV expects BGR.
    frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    result = send_to_backend(frame)
    caption = result.get("caption", "No caption")
    audio_base64 = result.get("audio_base64", None)
    if audio_base64:
        # Decode the base64 audio and read it into the (sample_rate, samples)
        # tuple format that gr.Audio accepts.
        audio_bytes = base64.b64decode(audio_base64)
        audio_buffer = io.BytesIO(audio_bytes)
        audio_array, sample_rate = sf.read(audio_buffer)
        return caption, (sample_rate, audio_array)
    return caption, None

# Gradio interface
demo = gr.Interface(
    fn=process_webcam,
    inputs=gr.Image(sources=["upload", "webcam"]),
    outputs=[
        gr.Textbox(label="Caption"),
        gr.Audio(label="Audio Output"),
    ],
    live=True,
    title="SpokenVision",
    description="Real-time object detection and captioning with audio feedback",
    allow_flagging="never",
)

demo.launch()
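
# Note: when running outside of Hugging Face Spaces, demo.launch(share=True)
# can be used to expose the demo through a temporary public link.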