tonyliu404 committed on
Commit
b8079ec
·
verified ·
1 Parent(s): b206a94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -43
app.py CHANGED
@@ -1,41 +1,16 @@
1
  import gradio as gr
2
  import cv2
3
- import tempfile
4
  import numpy as np
5
  import os
6
- import time
7
  import requests
8
  import base64
9
- import threading
10
- import pygame
 
11
 
12
  # Backend server URL
13
  backend_server_url = "https://0416-2600-1017-a410-36b8-2357-52be-1318-959b.ngrok-free.app"
14
 
15
- send_thread = None # To keep track of ongoing threads
16
-
17
- # Audio playback
18
- def play_audio(audio_base64):
19
- """
20
- Play audio file using pygame mixer.
21
-
22
- Args:
23
- audio_path: Path to audio file
24
- """
25
- audio_bytes = base64.b64decode(audio_base64)
26
- try:
27
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
28
- temp_audio.write(audio_bytes)
29
- temp_audio_path = temp_audio.name
30
-
31
- pygame.mixer.init()
32
- pygame.mixer.music.load(temp_audio_path)
33
- pygame.mixer.music.play()
34
- while pygame.mixer.music.get_busy():
35
- pygame.time.Clock().tick(10)
36
- except Exception as e:
37
- print(f"Audio error: {e}")
38
-
39
  # Backend interaction
40
  def send_to_backend(frame):
41
  try:
@@ -64,29 +39,24 @@ def send_to_backend(frame):
64
  except Exception as e:
65
  return {"error": f"Exception: {str(e)}"}
66
 
67
- def thread_sendToBackend(frame):
68
- """ Starts a thread to send the frame to the backend. """
69
- global send_thread
70
- if send_thread is None:
71
- send_thread = threading.Thread(target=send_to_backend, args=(frame,), daemon=True)
72
- send_thread.start()
73
-
74
-
75
  # # Gradio processing function
76
  def process_webcam(image):
77
  if image is None:
78
- return None, "No frame", None
79
 
80
  frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
81
  result = send_to_backend(frame)
82
- print(len(result))
83
- caption = result['caption']
84
- audio_base64 = result['audio_base64']
85
 
86
  if audio_base64:
87
- threading.Thread(target=play_audio, args=(audio_base64,), daemon=True).start()
 
 
 
88
 
89
- return caption
90
 
91
 
92
  # Gradio interface
@@ -95,6 +65,7 @@ demo = gr.Interface(
95
  inputs=gr.Image(sources=["upload", "webcam"]),
96
  outputs=[
97
  gr.Textbox(label="Caption"),
 
98
  ],
99
  live=True,
100
  title="SpokenVision",
@@ -102,4 +73,4 @@ demo = gr.Interface(
102
  allow_flagging="never"
103
  )
104
 
105
- demo.launch()
 
1
  import gradio as gr
2
  import cv2
 
3
  import numpy as np
4
  import os
 
5
  import requests
6
  import base64
7
+ import base64
8
+ import io
9
+ import soundfile as sf
10
 
11
  # Backend server URL
12
  backend_server_url = "https://0416-2600-1017-a410-36b8-2357-52be-1318-959b.ngrok-free.app"
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Backend interaction
15
  def send_to_backend(frame):
16
  try:
 
39
  except Exception as e:
40
  return {"error": f"Exception: {str(e)}"}
41
 
 
 
 
 
 
 
 
 
42
  # # Gradio processing function
43
  def process_webcam(image):
44
  if image is None:
45
+ return None, None
46
 
47
  frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
48
  result = send_to_backend(frame)
49
+
50
+ caption = result.get("caption", "No caption")
51
+ audio_base64 = result.get("audio_base64", None)
52
 
53
  if audio_base64:
54
+ audio_bytes = base64.b64decode(audio_base64)
55
+ audio_buffer = io.BytesIO(audio_bytes)
56
+ audio_array, sample_rate = sf.read(audio_buffer)
57
+ return caption, (sample_rate, audio_array)
58
 
59
+ return caption, None
60
 
61
 
62
  # Gradio interface
 
65
  inputs=gr.Image(sources=["upload", "webcam"]),
66
  outputs=[
67
  gr.Textbox(label="Caption"),
68
+ gr.Audio(label="Audio Output")
69
  ],
70
  live=True,
71
  title="SpokenVision",
 
73
  allow_flagging="never"
74
  )
75
 
76
+ demo.launch()