openfree committed on
Commit
3f3dfbe
·
verified ·
1 Parent(s): 83f1de2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -67
app.py CHANGED
@@ -15,11 +15,14 @@ import torch
15
  import numpy as np
16
  from loguru import logger
17
  from PIL import Image
18
- from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer, WhisperProcessor, WhisperForConditionalGeneration
19
  import time
20
  import warnings
21
  from typing import Dict, List, Optional, Union
22
  import librosa
 
 
 
23
 
24
  # CSV/TXT ๋ถ„์„
25
  import pandas as pd
@@ -28,7 +31,7 @@ import PyPDF2
28
 
29
  warnings.filterwarnings('ignore')
30
 
31
- print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B + Whisper)...")
32
 
33
  ##############################################################################
34
  # ์ƒ์ˆ˜ ์ •์˜
@@ -44,7 +47,6 @@ SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
44
  model = None
45
  processor = None
46
  whisper_model = None
47
- whisper_processor = None
48
  model_loaded = False
49
  whisper_loaded = False
50
  model_name = "Gemma3-R1984-4B"
@@ -63,7 +65,7 @@ def clear_cuda_cache():
63
  ##############################################################################
64
  @spaces.GPU(duration=60)
65
  def load_whisper():
66
- global whisper_model, whisper_processor, whisper_loaded
67
 
68
  if whisper_loaded:
69
  logger.info("Whisper ๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
@@ -71,11 +73,13 @@ def load_whisper():
71
 
72
  try:
73
  logger.info("Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
74
- whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
75
- whisper_model = WhisperForConditionalGeneration.from_pretrained(
76
- "openai/whisper-base",
77
- device_map="auto",
78
- torch_dtype=torch.float16
 
 
79
  )
80
  whisper_loaded = True
81
  logger.info("โœ… Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
@@ -87,42 +91,141 @@ def load_whisper():
87
  ##############################################################################
88
  # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ํ•จ์ˆ˜
89
  ##############################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  @spaces.GPU(duration=30)
91
- def transcribe_audio(audio_data):
92
  """Whisper๋ฅผ ์‚ฌ์šฉํ•œ ์˜ค๋””์˜ค ์ „์‚ฌ"""
93
- global whisper_model, whisper_processor
94
 
95
  if not whisper_loaded:
96
  if not load_whisper():
97
- return "์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ๋ถˆ๊ฐ€"
98
 
99
  try:
100
- if audio_data is None:
101
- return None
102
-
103
- # ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ
104
- sample_rate, audio = audio_data
105
-
106
- # 16kHz๋กœ ๋ฆฌ์ƒ˜ํ”Œ๋ง
107
- if sample_rate != 16000:
108
- audio = librosa.resample(audio.astype(float), orig_sr=sample_rate, target_sr=16000)
109
-
110
- # Whisper ์ž…๋ ฅ ์ฒ˜๋ฆฌ
111
- inputs = whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
112
- inputs = {k: v.to(whisper_model.device) for k, v in inputs.items()}
113
-
114
  # ์Œ์„ฑ ์ธ์‹
115
- with torch.no_grad():
116
- generated_ids = whisper_model.generate(**inputs, max_length=225)
117
 
118
- # ๋””์ฝ”๋”ฉ
119
- transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- return transcription.strip()
 
 
 
 
122
 
 
 
 
 
 
 
 
 
 
 
 
123
  except Exception as e:
124
- logger.error(f"์˜ค๋””์˜ค ์ „์‚ฌ ์˜ค๋ฅ˜: {e}")
125
- return f"์˜ค๋””์˜ค ์ธ์‹ ์‹คํŒจ: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  ##############################################################################
128
  # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
@@ -591,13 +694,19 @@ css = """
591
  background: #e3f2fd;
592
  color: #1565c0;
593
  }
 
 
 
 
 
 
594
  """
595
 
596
  with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
597
  gr.HTML("""
598
  <div class="robot-header">
599
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
600
- <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐ŸŽค ์Œ์„ฑ ์ธ์‹</h3>
601
  <p>โšก ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„!</p>
602
  </div>
603
  """)
@@ -636,6 +745,11 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
636
  '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
637
  )
638
 
 
 
 
 
 
639
  # ๋งˆ์ง€๋ง‰ ์ธ์‹๋œ ํ…์ŠคํŠธ
640
  last_transcript = gr.Textbox(
641
  label="์ธ์‹๋œ ์Œ์„ฑ",
@@ -657,9 +771,9 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
657
  )
658
 
659
  use_audio_toggle = gr.Checkbox(
660
- label="๐ŸŽค ์Œ์„ฑ ์ธ์‹ ์‚ฌ์šฉ",
661
  value=False,
662
- info="์ฃผ๋ณ€ ์†Œ๋ฆฌ๋ฅผ ์ธ์‹ํ•˜์—ฌ ๋ถ„์„์— ํฌํ•จ"
663
  )
664
 
665
  with gr.Row():
@@ -719,6 +833,19 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
719
  label="๋งˆ์ดํฌ ์ž…๋ ฅ"
720
  )
721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  # ๋ฌธ์„œ ๋ถ„์„ ํƒญ (์ˆจ๊น€)
723
  with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„", visible=False):
724
  with gr.Row():
@@ -752,8 +879,6 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
752
 
753
  # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
754
  webcam_state = gr.State(None)
755
- audio_state = gr.State(None)
756
- transcript_state = gr.State("")
757
 
758
  def capture_webcam(frame):
759
  """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
@@ -763,15 +888,29 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
763
 
764
  def clear_capture():
765
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
 
 
 
 
 
 
 
766
  return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>', ""
767
 
768
- def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens, transcript):
769
  """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
 
 
770
  if image is None:
771
  return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
772
 
773
  status = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿš€ {task_type} ๋ถ„์„ ์ค‘...</div>'
774
 
 
 
 
 
 
775
  result = analyze_image_for_robot(
776
  image=image,
777
  prompt=prompt,
@@ -800,27 +939,31 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
800
  return formatted_result, complete_status
801
 
802
  # ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ ํ•จ์ˆ˜
803
- def auto_capture_and_analyze(webcam_frame, audio_data, task_prompt, use_search, thinking, tokens, use_audio, current_transcript):
804
- """์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ (์˜ค๋””์˜ค ํฌํ•จ)"""
 
 
805
  if webcam_frame is None:
806
  return (
807
  None,
808
  "์ž๋™ ์บก์ฒ˜ ๋Œ€๊ธฐ ์ค‘...",
809
  '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
810
  '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
811
- current_transcript,
812
- current_transcript
813
  )
814
 
815
  # ์บก์ฒ˜ ์ˆ˜ํ–‰
816
  timestamp = time.strftime("%H:%M:%S")
817
 
818
- # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ (ํ™œ์„ฑํ™”๋œ ๊ฒฝ์šฐ)
819
- new_transcript = ""
820
- if use_audio and audio_data is not None:
821
- transcribed = transcribe_audio(audio_data)
822
- if transcribed and transcribed != "์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ๋ถˆ๊ฐ€":
823
- new_transcript = transcribed
 
 
 
824
 
825
  # ์ด๋ฏธ์ง€ ๋ถ„์„ (์ž‘์—… ๊ณ„ํš ๋ชจ๋“œ๋กœ)
826
  result = analyze_image_for_robot(
@@ -830,7 +973,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
830
  use_web_search=use_search,
831
  enable_thinking=thinking,
832
  max_new_tokens=tokens,
833
- audio_transcript=new_transcript if new_transcript else None
834
  )
835
 
836
  formatted_result = f"""๐Ÿ”„ ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ ({timestamp})
@@ -838,13 +981,15 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
838
  {result}
839
  โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"""
840
 
 
 
 
841
  return (
842
  webcam_frame,
843
  formatted_result,
844
  '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
845
  f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>',
846
- new_transcript if new_transcript else current_transcript,
847
- new_transcript if new_transcript else current_transcript
848
  )
849
 
850
  # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
@@ -854,14 +999,17 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
854
  outputs=[webcam_state]
855
  )
856
 
857
- # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ
858
- def process_audio_stream(audio_data):
859
- return audio_data
 
 
860
 
 
861
  audio_input.stream(
862
- fn=process_audio_stream,
863
  inputs=[audio_input],
864
- outputs=[audio_state]
865
  )
866
 
867
  # ์ˆ˜๋™ ์บก์ฒ˜ ๋ฒ„ํŠผ
@@ -874,19 +1022,19 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
874
  # ์ดˆ๊ธฐํ™” ๋ฒ„ํŠผ
875
  clear_capture_btn.click(
876
  fn=clear_capture,
877
- outputs=[webcam_state, captured_image, status_display, transcript_state]
878
  )
879
 
880
  # ์ž‘์—… ๋ฒ„ํŠผ๋“ค
881
  planning_btn.click(
882
- fn=lambda img, p, s, t, tk, tr: analyze_with_task(img, p, "planning", s, t, tk, tr),
883
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens, transcript_state],
884
  outputs=[result_output, status_display]
885
  )
886
 
887
  grounding_btn.click(
888
- fn=lambda img, p, s, t, tk, tr: analyze_with_task(img, p, "grounding", s, t, tk, tr),
889
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens, transcript_state],
890
  outputs=[result_output, status_display]
891
  )
892
 
@@ -924,14 +1072,29 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
924
 
925
  # ์˜ค๋””์˜ค ํ† ๊ธ€ ์ด๋ฒคํŠธ
926
  def toggle_audio(enabled):
 
 
927
  if enabled:
928
  # Whisper ๋ชจ๋ธ ๋กœ๋“œ
929
  load_whisper()
 
 
 
 
 
 
 
930
  return (
931
  gr.update(visible=True), # audio_input ํ‘œ์‹œ
932
- '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ํ™œ์„ฑํ™”๋จ</div>'
933
  )
934
  else:
 
 
 
 
 
 
935
  return (
936
  gr.update(visible=False), # audio_input ์ˆจ๊น€
937
  '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
@@ -946,13 +1109,18 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
946
  # ํƒ€์ด๋จธ ํ‹ฑ ์ด๋ฒคํŠธ
947
  timer.tick(
948
  fn=auto_capture_and_analyze,
949
- inputs=[webcam_state, audio_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle, transcript_state],
950
- outputs=[captured_image, result_output, status_display, auto_capture_status, transcript_state, last_transcript]
951
  )
952
 
953
  # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
954
  def initial_load():
955
  load_model()
 
 
 
 
 
956
  return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
957
 
958
  demo.load(
@@ -961,8 +1129,8 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
961
  )
962
 
963
  if __name__ == "__main__":
964
- print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B + Whisper)...")
965
- demo.launch(
966
  server_name="0.0.0.0",
967
  server_port=7860,
968
  share=False,
 
15
  import numpy as np
16
  from loguru import logger
17
  from PIL import Image
18
+ from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer, pipeline
19
  import time
20
  import warnings
21
  from typing import Dict, List, Optional, Union
22
  import librosa
23
+ import scipy.signal as sps
24
+ from threading import Thread, Lock
25
+ import queue
26
 
27
  # CSV/TXT ๋ถ„์„
28
  import pandas as pd
 
31
 
32
  warnings.filterwarnings('ignore')
33
 
34
+ print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B + Whisper + 10์ดˆ ๊ต๋Œ€ ๋…น์Œ)...")
35
 
36
  ##############################################################################
37
  # ์ƒ์ˆ˜ ์ •์˜
 
47
  model = None
48
  processor = None
49
  whisper_model = None
 
50
  model_loaded = False
51
  whisper_loaded = False
52
  model_name = "Gemma3-R1984-4B"
 
65
  ##############################################################################
66
  @spaces.GPU(duration=60)
67
  def load_whisper():
68
+ global whisper_model, whisper_loaded
69
 
70
  if whisper_loaded:
71
  logger.info("Whisper ๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
 
73
 
74
  try:
75
  logger.info("Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
76
+ # ํŒŒ์ดํ”„๋ผ์ธ ๋ฐฉ์‹์œผ๋กœ ๋กœ๋“œ
77
+ device = 0 if torch.cuda.is_available() else "cpu"
78
+ whisper_model = pipeline(
79
+ task="automatic-speech-recognition",
80
+ model="openai/whisper-base",
81
+ chunk_length_s=30,
82
+ device=device,
83
  )
84
  whisper_loaded = True
85
  logger.info("โœ… Whisper ๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
 
91
  ##############################################################################
92
  # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ํ•จ์ˆ˜
93
  ##############################################################################
94
+ import scipy.signal as sps
95
+ from threading import Thread, Lock
96
+ import queue
97
+
98
+ # ์˜ค๋””์˜ค ๋ฒ„ํผ ๊ด€๋ฆฌ
99
# Shared state for the alternating (A/B) microphone buffers. It is touched by
# the stream callback, the periodic capture tick and the background worker.
audio_buffer_lock = Lock()  # guards the buffers, the active-buffer flag and last_transcription
audio_buffer_a, audio_buffer_b = [], []  # each holds (samples, sample_rate) tuples
current_buffer = 'a'  # name of the buffer currently being recorded into
processing_queue = queue.Queue()  # (buffer_name, chunks) snapshots waiting for transcription
last_transcription = ""  # most recent transcription text
105
+
106
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
    """Resample an audio signal to ``target_sr`` and return it as float32.

    Args:
        audio: Samples recorded at ``orig_sr`` (array-like).
        orig_sr: Sampling rate of ``audio`` in Hz.
        target_sr: Desired sampling rate in Hz.

    Returns:
        The signal at ``target_sr`` as ``np.float32``. When the rates already
        match, the input is only cast. When the resampled length rounds to
        zero (empty or very short input), an empty array is returned.

    Raises:
        ValueError: If either sampling rate is not positive.
    """
    if orig_sr <= 0 or target_sr <= 0:
        raise ValueError(
            f"sampling rates must be positive (orig_sr={orig_sr}, target_sr={target_sr})"
        )

    audio = np.asarray(audio)
    if orig_sr == target_sr:
        return audio.astype(np.float32)

    number_of_samples = round(len(audio) * float(target_sr) / orig_sr)
    if number_of_samples == 0:
        # scipy's FFT-based resampler rejects a zero-length target, so
        # short-circuit instead of letting it raise.
        return np.zeros(0, dtype=np.float32)

    # Fourier-method resampling via scipy.
    return sps.resample(audio, number_of_samples).astype(np.float32)
115
+
116
  @spaces.GPU(duration=30)
117
def transcribe_audio_whisper(audio_array: np.ndarray, sr: int = 16000):
    """Transcribe an audio signal with the Whisper pipeline.

    The model is loaded on demand through ``load_whisper()`` when
    ``whisper_loaded`` is still false.

    Args:
        audio_array: Audio samples to transcribe.
        sr: Sampling rate of ``audio_array`` in Hz.

    Returns:
        The stripped transcription text, or ``None`` when there is no audio,
        the model could not be loaded, the recognised text is empty, or the
        pipeline raised.
    """
    # Nothing to transcribe: avoid loading the model or running inference.
    if audio_array is None or len(audio_array) == 0:
        return None

    if not whisper_loaded and not load_whisper():
        return None

    try:
        # Speech recognition: the pipeline takes the raw samples together
        # with their sampling rate.
        result = whisper_model({"array": audio_array, "sampling_rate": sr})
        transcription = result["text"].strip()
        return transcription if transcription else None
    except Exception as e:
        # Best effort: callers treat None as "no transcription available".
        logger.error(f"Whisper ์˜ค๋””์˜ค ์ „์‚ฌ ์˜ค๋ฅ˜: {e}")
        return None
135
+
136
def accumulate_audio(audio_chunk):
    """Append one streamed audio chunk to the buffer currently being recorded.

    Args:
        audio_chunk: ``(sample_rate, samples)`` tuple, or ``None`` when the
            stream delivered nothing.

    The chunk is stored as ``(samples, sample_rate)`` in ``audio_buffer_a`` or
    ``audio_buffer_b`` depending on ``current_buffer``. Empty chunks are
    ignored.
    """
    if audio_chunk is None:
        return

    sr, audio = audio_chunk
    audio = np.asarray(audio)
    if audio.size == 0:
        return

    # Signed integer PCM is rescaled to float32 in [-1, 1) *before* the mono
    # mix-down. Otherwise mean() would silently produce float64 values that
    # are still at integer scale.
    # NOTE(review): assumes integer input is full-scale PCM (e.g. int16 from
    # the microphone component) - confirm against the audio source.
    if np.issubdtype(audio.dtype, np.signedinteger):
        full_scale = float(np.iinfo(audio.dtype).max) + 1.0
        audio = audio.astype(np.float32) / full_scale

    # Stereo -> mono by averaging the channel axis.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    with audio_buffer_lock:
        active = audio_buffer_a if current_buffer == 'a' else audio_buffer_b
        active.append((audio, sr))
154
+
155
def switch_buffers():
    """Flip the active recording buffer and queue the finished one.

    Under ``audio_buffer_lock``: if the buffer that was recording holds any
    chunks, a snapshot of it is put on ``processing_queue`` as
    ``(buffer_name, chunks)`` and the buffer is emptied. Recording then
    continues in the other buffer, whether or not anything was queued.
    """
    global current_buffer

    with audio_buffer_lock:
        finished_name = current_buffer
        finished = audio_buffer_a if finished_name == 'a' else audio_buffer_b

        if finished:
            # Hand over a copy so clearing the live list does not affect the
            # queued snapshot.
            processing_queue.put((finished_name, list(finished)))
            finished.clear()

        current_buffer = 'b' if finished_name == 'a' else 'a'
172
+
173
def process_audio_buffer(buffer_data):
    """Join the chunks of one recorded buffer and transcribe them.

    Args:
        buffer_data: ``(buffer_name, audio_chunks)`` where ``audio_chunks`` is
            a list of ``(samples, sample_rate)`` tuples.

    Returns:
        The transcription text, or ``None`` when the buffer is empty, nothing
        was recognised, or processing failed (the error is logged).
    """
    buffer_name, audio_chunks = buffer_data

    if not audio_chunks:
        return None

    target_sr = 16000  # rate every chunk is brought to before transcription

    try:
        prepared = []
        for audio, sr in audio_chunks:
            audio = np.asarray(audio)

            # Signed integer PCM -> float32 in [-1, 1).
            # NOTE(review): assumes integer chunks are full-scale PCM.
            if np.issubdtype(audio.dtype, np.signedinteger):
                full_scale = float(np.iinfo(audio.dtype).max) + 1.0
                audio = audio.astype(np.float32) / full_scale

            if sr != target_sr:
                audio = resample_audio(audio, sr, target_sr)

            # Chunks that were already at the target rate skip the resampler,
            # so cast here to keep a single dtype for concatenation.
            prepared.append(audio.astype(np.float32, copy=False))

        full_audio = np.concatenate(prepared)

        # Transcribe with Whisper.
        transcription = transcribe_audio_whisper(full_audio, target_sr)
        if transcription:
            logger.info(f"๋ฒ„ํผ {buffer_name} ์ „์‚ฌ ์™„๋ฃŒ: {transcription[:50]}...")
            return transcription

    except Exception as e:
        logger.error(f"์˜ค๋””์˜ค ๋ฒ„ํผ ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")

    return None
206
+
207
+ # ๋ฐฑ๊ทธ๋ผ์šด๋“œ ์ฒ˜๋ฆฌ ์Šค๋ ˆ๋“œ
208
def audio_processing_worker():
    """Loop forever, transcribing queued audio buffers.

    Each iteration waits up to one second for an item on ``processing_queue``
    and runs it through ``process_audio_buffer``. A non-empty result is stored
    in ``last_transcription`` under ``audio_buffer_lock``. Any other error is
    logged and the loop keeps running.
    """
    global last_transcription

    while True:
        try:
            # Wait for the next buffer snapshot, then transcribe it.
            text = process_audio_buffer(processing_queue.get(timeout=1))
            if not text:
                continue

            # Publish the result for later readers.
            with audio_buffer_lock:
                last_transcription = text

        except queue.Empty:
            # Nothing arrived within the timeout; poll again.
            continue
        except Exception as e:
            logger.error(f"์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์›Œ์ปค ์˜ค๋ฅ˜: {e}")
229
 
230
  ##############################################################################
231
  # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
 
694
  background: #e3f2fd;
695
  color: #1565c0;
696
  }
697
+ .buffer-info {
698
+ font-size: 0.9em;
699
+ color: #666;
700
+ text-align: center;
701
+ margin-top: 5px;
702
+ }
703
  """
704
 
705
  with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
706
  gr.HTML("""
707
  <div class="robot-header">
708
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
709
+ <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐ŸŽค 10์ดˆ ๊ต๋Œ€ ์Œ์„ฑ ์ธ์‹</h3>
710
  <p>โšก ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„!</p>
711
  </div>
712
  """)
 
745
  '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
746
  )
747
 
748
+ # ๋ฒ„ํผ ์ •๋ณด
749
+ gr.HTML(
750
+ '<div class="buffer-info">A/B ๋ฒ„ํผ ๊ต๋Œ€ ๋…น์Œ์œผ๋กœ ๋Š๊น€ ์—†๋Š” ์ธ์‹</div>'
751
+ )
752
+
753
  # ๋งˆ์ง€๋ง‰ ์ธ์‹๋œ ํ…์ŠคํŠธ
754
  last_transcript = gr.Textbox(
755
  label="์ธ์‹๋œ ์Œ์„ฑ",
 
771
  )
772
 
773
  use_audio_toggle = gr.Checkbox(
774
+ label="๐ŸŽค ์Œ์„ฑ ์ธ์‹ ์‚ฌ์šฉ (10์ดˆ ๊ต๋Œ€ ๋…น์Œ)",
775
  value=False,
776
+ info="10์ดˆ๋งˆ๋‹ค ๊ต๋Œ€๋กœ ๋…น์Œํ•˜์—ฌ ๋Š๊น€ ์—†์ด ์ธ์‹"
777
  )
778
 
779
  with gr.Row():
 
833
  label="๋งˆ์ดํฌ ์ž…๋ ฅ"
834
  )
835
 
836
+ # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์ฒ˜๋ฆฌ
837
def audio_stream_callback(audio_chunk):
    """Stream handler: add the incoming chunk to the active audio buffer."""
    accumulate_audio(audio_chunk)
    # No state update is returned (the function yields None).


# Attach the handler to the microphone stream; it updates no outputs.
audio_input.stream(
    fn=audio_stream_callback,
    inputs=[audio_input],
    outputs=None,
)
848
+
849
  # ๋ฌธ์„œ ๋ถ„์„ ํƒญ (์ˆจ๊น€)
850
  with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„", visible=False):
851
  with gr.Row():
 
879
 
880
  # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
881
  webcam_state = gr.State(None)
 
 
882
 
883
  def capture_webcam(frame):
884
  """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
 
888
 
889
def clear_capture():
    """Reset the capture and drop all buffered audio and the last transcription.

    Returns:
        A 4-tuple: ``None``, an update that hides a component, the "ready"
        status HTML, and an empty string.
    """
    global last_transcription

    with audio_buffer_lock:
        for pending in (audio_buffer_a, audio_buffer_b):
            pending.clear()
        last_transcription = ""

    ready_banner = '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>'
    return None, gr.update(visible=False), ready_banner, ""
899
 
900
+ def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
901
  """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
902
+ global last_transcription
903
+
904
  if image is None:
905
  return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
906
 
907
  status = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿš€ {task_type} ๋ถ„์„ ์ค‘...</div>'
908
 
909
+ # ํ˜„์žฌ ์ „์‚ฌ ํ…์ŠคํŠธ ๊ฐ€์ ธ์˜ค๊ธฐ
910
+ transcript = ""
911
+ with audio_buffer_lock:
912
+ transcript = last_transcription
913
+
914
  result = analyze_image_for_robot(
915
  image=image,
916
  prompt=prompt,
 
939
  return formatted_result, complete_status
940
 
941
  # ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ ํ•จ์ˆ˜
942
+ def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
943
+ """์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ (10์ดˆ๋งˆ๋‹ค ์˜ค๋””์˜ค ๋ฒ„ํผ ์ „ํ™˜)"""
944
+ global last_transcription
945
+
946
  if webcam_frame is None:
947
  return (
948
  None,
949
  "์ž๋™ ์บก์ฒ˜ ๋Œ€๊ธฐ ์ค‘...",
950
  '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
951
  '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
952
+ ""
 
953
  )
954
 
955
  # ์บก์ฒ˜ ์ˆ˜ํ–‰
956
  timestamp = time.strftime("%H:%M:%S")
957
 
958
+ # ๋ฒ„ํผ ์ „ํ™˜ (10์ดˆ๋งˆ๋‹ค)
959
+ if use_audio:
960
+ switch_buffers()
961
+
962
+ # ๋งˆ์ง€๋ง‰ ์ „์‚ฌ ๊ฒฐ๊ณผ ๊ฐ€์ ธ์˜ค๊ธฐ
963
+ audio_transcript = ""
964
+ if use_audio:
965
+ with audio_buffer_lock:
966
+ audio_transcript = last_transcription
967
 
968
  # ์ด๋ฏธ์ง€ ๋ถ„์„ (์ž‘์—… ๊ณ„ํš ๋ชจ๋“œ๋กœ)
969
  result = analyze_image_for_robot(
 
973
  use_web_search=use_search,
974
  enable_thinking=thinking,
975
  max_new_tokens=tokens,
976
+ audio_transcript=audio_transcript if audio_transcript else None
977
  )
978
 
979
  formatted_result = f"""๐Ÿ”„ ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ ({timestamp})
 
981
  {result}
982
  โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"""
983
 
984
+ # ๋งˆ์ง€๋ง‰ ์ธ์‹๋œ ํ…์ŠคํŠธ ์—…๋ฐ์ดํŠธ
985
+ transcript_display = audio_transcript if audio_transcript else "์Œ์„ฑ ์ธ์‹ ๋Œ€๊ธฐ ์ค‘..."
986
+
987
  return (
988
  webcam_frame,
989
  formatted_result,
990
  '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
991
  f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>',
992
+ transcript_display
 
993
  )
994
 
995
  # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
 
999
  outputs=[webcam_state]
1000
  )
1001
 
1002
+ # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์ฒ˜๋ฆฌ
1003
# Audio streaming: `audio_stream_callback` is already defined and attached via
# `audio_input.stream(...)` next to the microphone input component earlier in
# this layout. Attaching the same callback a second time here would run
# `accumulate_audio` twice for every streamed chunk, so each chunk would be
# stored in the active buffer twice. No additional wiring is done here.
1014
 
1015
  # ์ˆ˜๋™ ์บก์ฒ˜ ๋ฒ„ํŠผ
 
1022
  # ์ดˆ๊ธฐํ™” ๋ฒ„ํŠผ
1023
  clear_capture_btn.click(
1024
  fn=clear_capture,
1025
+ outputs=[webcam_state, captured_image, status_display, last_transcript]
1026
  )
1027
 
1028
  # ์ž‘์—… ๋ฒ„ํŠผ๋“ค
1029
  planning_btn.click(
1030
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
1031
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
1032
  outputs=[result_output, status_display]
1033
  )
1034
 
1035
  grounding_btn.click(
1036
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
1037
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
1038
  outputs=[result_output, status_display]
1039
  )
1040
 
 
1072
 
1073
  # ์˜ค๋””์˜ค ํ† ๊ธ€ ์ด๋ฒคํŠธ
1074
  def toggle_audio(enabled):
1075
+ global audio_buffer_a, audio_buffer_b, current_buffer, last_transcription
1076
+
1077
  if enabled:
1078
  # Whisper ๋ชจ๋ธ ๋กœ๋“œ
1079
  load_whisper()
1080
+ # ๋ฒ„ํผ ์ดˆ๊ธฐํ™”
1081
+ with audio_buffer_lock:
1082
+ audio_buffer_a.clear()
1083
+ audio_buffer_b.clear()
1084
+ current_buffer = 'a'
1085
+ last_transcription = ""
1086
+
1087
  return (
1088
  gr.update(visible=True), # audio_input ํ‘œ์‹œ
1089
+ '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ํ™œ์„ฑํ™”๋จ (10์ดˆ ๊ต๋Œ€ ๋…น์Œ)</div>'
1090
  )
1091
  else:
1092
+ # ๋ฒ„ํผ ์ดˆ๊ธฐํ™”
1093
+ with audio_buffer_lock:
1094
+ audio_buffer_a.clear()
1095
+ audio_buffer_b.clear()
1096
+ last_transcription = ""
1097
+
1098
  return (
1099
  gr.update(visible=False), # audio_input ์ˆจ๊น€
1100
  '<div class="audio-status">๐ŸŽค ์Œ์„ฑ ์ธ์‹: ๋น„ํ™œ์„ฑํ™”</div>'
 
1109
  # ํƒ€์ด๋จธ ํ‹ฑ ์ด๋ฒคํŠธ
1110
  timer.tick(
1111
  fn=auto_capture_and_analyze,
1112
+ inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, use_audio_toggle],
1113
+ outputs=[captured_image, result_output, status_display, auto_capture_status, last_transcript]
1114
  )
1115
 
1116
  # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
1117
def initial_load():
    """Load the main model and make sure the audio worker thread is running.

    Returns:
        The "system ready" status message.
    """
    load_model()

    # Start the background audio worker only once. The worker loops forever,
    # so starting a fresh thread on every call would leak threads.
    # NOTE(review): presumably this runs from a page-load event and can fire
    # more than once - confirm against the `demo.load(...)` wiring.
    worker = getattr(initial_load, "_audio_worker_thread", None)
    if worker is None or not worker.is_alive():
        worker = Thread(target=audio_processing_worker, daemon=True)
        worker.start()
        initial_load._audio_worker_thread = worker

    return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
1125
 
1126
  demo.load(
 
1129
  )
1130
 
1131
  if __name__ == "__main__":
1132
+ print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B + Whisper 10์ดˆ ๊ต๋Œ€ ๋…น์Œ)...")
1133
+ demo.queue().launch(
1134
  server_name="0.0.0.0",
1135
  server_port=7860,
1136
  share=False,