openfree commited on
Commit
24e2113
ยท
verified ยท
1 Parent(s): 0ecba6e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +834 -0
app.py ADDED
@@ -0,0 +1,834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import re
5
+ import tempfile
6
+ import gc
7
+ from collections.abc import Iterator
8
+ from threading import Thread
9
+ import json
10
+ import requests
11
+ import cv2
12
+ import gradio as gr
13
+ import spaces
14
+ import torch
15
+ import numpy as np
16
+ from loguru import logger
17
+ from PIL import Image
18
+ from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
19
+ import time
20
+ import warnings
21
+ from typing import Dict, List, Optional, Union
22
+
23
+ # CSV/TXT ๋ถ„์„
24
+ import pandas as pd
25
+ # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
26
+ import PyPDF2
27
+
28
+ warnings.filterwarnings('ignore')
29
+
30
print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B)...")

##############################################################################
# Constants
##############################################################################
MAX_CONTENT_CHARS = 2000   # max characters of any one document fed into the prompt
MAX_INPUT_LENGTH = 2096    # max input token count; longer prompts are left-truncated
MAX_NUM_IMAGES = 5         # NOTE(review): defined but not used in this file — confirm callers
SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")  # SerpHouse web-search API key ("" disables auth)

##############################################################################
# Global state (populated lazily by load_model)
##############################################################################
model = None          # Gemma3ForConditionalGeneration instance once loaded
processor = None      # matching AutoProcessor for chat templating / decoding
model_loaded = False  # guards against repeated model loads
model_name = "Gemma3-R1984-4B"
47
+
48
+ ##############################################################################
49
+ # ๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ
50
+ ##############################################################################
51
def clear_cuda_cache():
    """Release cached CUDA memory and run Python garbage collection.

    The CUDA cache is only touched when a CUDA device is available, but
    garbage collection now runs unconditionally so host-side objects are
    also freed on CPU-only deployments (previously gc.collect() was only
    reached when CUDA was available).
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Collect regardless of device so Python-level garbage is reclaimed too.
    gc.collect()
56
+
57
+ ##############################################################################
58
+ # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
59
+ ##############################################################################
60
+ def extract_keywords(text: str, top_k: int = 5) -> str:
61
+ """ํ‚ค์›Œ๋“œ ์ถ”์ถœ"""
62
+ text = re.sub(r"[^a-zA-Z0-9๊ฐ€-ํžฃ\s]", "", text)
63
+ tokens = text.split()
64
+
65
+ seen = set()
66
+ unique_tokens = []
67
+ for token in tokens:
68
+ if token not in seen and len(token) > 1:
69
+ seen.add(token)
70
+ unique_tokens.append(token)
71
+
72
+ key_tokens = unique_tokens[:top_k]
73
+ return " ".join(key_tokens)
74
+
75
+ ##############################################################################
76
+ # ์›น ๊ฒ€์ƒ‰ ํ•จ์ˆ˜
77
+ ##############################################################################
78
def do_web_search(query: str) -> str:
    """Run a Google web search for *query* via the SerpHouse live API.

    Returns a markdown summary of up to 10 organic results prefixed with
    usage instructions for the LLM, or a human-readable failure message.
    Never raises: all errors are converted into the returned string.
    """
    # Fail fast with a clear message instead of a guaranteed HTTP auth error.
    if not SERPHOUSE_API_KEY:
        return "์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: SERPHOUSE_API_KEY is not configured."

    try:
        url = "https://api.serphouse.com/serp/live"

        params = {
            "q": query,
            "domain": "google.com",
            "serp_type": "web",
            "device": "desktop",
            "lang": "ko",   # prefer Korean results
            "num": "10"     # cap at 10 results
        }

        headers = {
            "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
        }

        logger.info(f"์›น ๊ฒ€์ƒ‰ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")

        response = requests.get(url, headers=headers, params=params, timeout=60)
        response.raise_for_status()

        data = response.json()

        # Defensive: the API may return "results" as a non-dict on errors.
        results = data.get("results", {})
        organic = results.get("organic", []) if isinstance(results, dict) else []

        if not organic:
            return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        max_results = min(10, len(organic))
        limited_organic = organic[:max_results]

        summary_lines = []
        for idx, item in enumerate(limited_organic, start=1):
            title = item.get("title", "์ œ๋ชฉ ์—†์Œ")
            link = item.get("link", "#")
            snippet = item.get("snippet", "์„ค๋ช… ์—†์Œ")
            displayed_link = item.get("displayed_link", link)

            summary_lines.append(
                f"### ๊ฒฐ๊ณผ {idx}: {title}\n\n"
                f"{snippet}\n\n"
                f"**์ถœ์ฒ˜**: [{displayed_link}]({link})\n\n"
                f"---\n"
            )

        # LLM-facing preamble explaining how to use the search results.
        instructions = """# ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ๋‹ต๋ณ€ ์‹œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”
2. ๊ด€๋ จ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š”
3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
"""

        search_results = instructions + "\n".join(summary_lines)
        return search_results

    except Exception as e:
        logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
        return f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {str(e)}"
139
+
140
+ ##############################################################################
141
+ # ๋ฌธ์„œ ์ฒ˜๋ฆฌ ํ•จ์ˆ˜
142
+ ##############################################################################
143
def analyze_csv_file(path: str) -> str:
    """Render a CSV file as prompt text, truncated to a safe size.

    At most the first 50 rows and 10 columns are shown, and the rendered
    table is clipped to MAX_CONTENT_CHARS. Read errors are returned as a
    message rather than raised.
    """
    name = os.path.basename(path)
    try:
        frame = pd.read_csv(path)
        # Keep the prompt small: cap at 50 rows x 10 columns.
        if frame.shape[0] > 50 or frame.shape[1] > 10:
            frame = frame.iloc[:50, :10]
        rendered = frame.to_string()
        if len(rendered) > MAX_CONTENT_CHARS:
            rendered = rendered[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
        return f"**[CSV ํŒŒ์ผ: {name}]**\n\n{rendered}"
    except Exception as e:
        return f"CSV ์ฝ๊ธฐ ์‹คํŒจ ({name}): {str(e)}"
155
+
156
def analyze_txt_file(path: str) -> str:
    """Read a UTF-8 text file for the prompt, clipped to MAX_CONTENT_CHARS.

    Read errors are returned as a message rather than raised.
    """
    name = os.path.basename(path)
    try:
        with open(path, "r", encoding="utf-8") as fh:
            body = fh.read()
        if len(body) > MAX_CONTENT_CHARS:
            body = body[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
        return f"**[TXT ํŒŒ์ผ: {name}]**\n\n{body}"
    except Exception as e:
        return f"TXT ์ฝ๊ธฐ ์‹คํŒจ ({name}): {str(e)}"
166
+
167
def pdf_to_markdown(pdf_path: str) -> str:
    """Convert the first pages of a PDF into markdown-style prompt text.

    Only up to 5 pages are extracted; each page gets a proportional share
    of MAX_CONTENT_CHARS and the combined text is clipped again at the
    end. Read errors are returned as a message rather than raised.
    """
    name = os.path.basename(pdf_path)
    chunks = []
    try:
        with open(pdf_path, "rb") as fh:
            reader = PyPDF2.PdfReader(fh)
            total_pages = len(reader.pages)
            page_budget = min(5, total_pages)
            for idx in range(page_budget):
                extracted = (reader.pages[idx].extract_text() or "").strip()
                if not extracted:
                    continue
                # Per-page character budget so no single page dominates.
                per_page_limit = MAX_CONTENT_CHARS // page_budget
                if len(extracted) > per_page_limit:
                    extracted = extracted[:per_page_limit] + "...(์ค‘๋žต)"
                chunks.append(f"## ํŽ˜์ด์ง€ {idx+1}\n\n{extracted}\n")
            if total_pages > page_budget:
                chunks.append(f"\n...({page_budget}/{total_pages} ํŽ˜์ด์ง€ ํ‘œ์‹œ)...")
    except Exception as e:
        return f"PDF ์ฝ๊ธฐ ์‹คํŒจ ({name}): {str(e)}"

    merged = "\n".join(chunks)
    if len(merged) > MAX_CONTENT_CHARS:
        merged = merged[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."

    return f"**[PDF ํŒŒ์ผ: {name}]**\n\n{merged}"
192
+
193
+ ##############################################################################
194
+ # ๋ชจ๋ธ ๋กœ๋“œ
195
+ ##############################################################################
196
@spaces.GPU(duration=120)
def load_model():
    """Lazily load the Gemma3 model and processor into the module globals.

    Idempotent: returns True immediately when already loaded. Returns
    False (never raises) on any load failure so callers can surface an
    error message instead of crashing the UI.
    """
    global model, processor, model_loaded

    # Fast path: everything already initialized by a previous call.
    if model_loaded:
        logger.info("๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
        return True

    try:
        logger.info("Gemma3-R1984-4B ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
        # Free as much GPU memory as possible before the large load.
        clear_cuda_cache()

        # Model id is overridable via env for easy swapping of checkpoints.
        model_id = os.getenv("MODEL_ID", "VIDraft/Gemma-3-R1984-4B")

        # Left padding so generation works with batched, left-truncated inputs.
        processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
        model = Gemma3ForConditionalGeneration.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            attn_implementation="eager"
        )

        model_loaded = True
        logger.info(f"โœ… {model_name} ๋กœ๋”ฉ ์™„๋ฃŒ!")
        return True

    except Exception as e:
        logger.error(f"๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ: {e}")
        return False
225
+
226
+ ##############################################################################
227
+ # ์ด๋ฏธ์ง€ ๋ถ„์„ (๋กœ๋ด‡ ํƒœ์Šคํฌ ์ค‘์‹ฌ)
228
+ ##############################################################################
229
@spaces.GPU(duration=60)
def analyze_image_for_robot(
    image: Union[np.ndarray, Image.Image],
    prompt: str,
    task_type: str = "general",
    use_web_search: bool = False,
    enable_thinking: bool = False,  # default off: skip chain-of-thought output
    max_new_tokens: int = 300  # 300 leaves room for the scene description
) -> str:
    """Analyze one image for a robot task and return the model's answer.

    task_type selects a task-specific system prompt: "general", "planning",
    "grounding", "affordance", "trajectory", or "pointing" (unknown values
    fall back to "general"). Optionally prepends web-search results and a
    chain-of-thought instruction. Returns the decoded text, or an error
    string (never raises).
    """
    global model, processor

    if not model_loaded:
        if not load_model():
            return "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"

    try:
        # Webcam frames arrive as numpy arrays; normalize to RGB PIL.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image).convert('RGB')

        # Task-specific system prompts (kept deliberately concise).
        system_prompts = {
            "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ์žฅ๋ฉด์„ 1-2์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ํ•ต์‹ฌ ๋‚ด์šฉ์„ ๊ฐ„๊ฒฐํ•˜๊ฒŒ ๋ถ„์„ํ•˜์„ธ์š”.",
            "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš AI์ž…๋‹ˆ๋‹ค.
๋จผ์ € ์žฅ๋ฉด ์ดํ•ด๋ฅผ 1-2์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ๊ทธ ๋‹ค์Œ ์ž‘์—… ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
ํ˜•์‹:
[์žฅ๋ฉด ์ดํ•ด] ํ˜„์žฌ ๋ณด์ด๋Š” ์žฅ๋ฉด์„ 1-2์ค„๋กœ ์„ค๋ช…

[์ž‘์—… ๊ณ„ํš]
Step_1: xxx
Step_2: xxx
Step_n: xxx""",
            "grounding": "๋‹น์‹ ์€ ๊ฐ์ฒด ์œ„์น˜ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ๋ณด์ด๋Š” ๊ฐ์ฒด๋“ค์„ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ์š”์ฒญ๋œ ๊ฐ์ฒด ์œ„์น˜๋ฅผ [x1, y1, x2, y2]๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
            "affordance": "๋‹น์‹ ์€ ํŒŒ์ง€์  ๋ถ„์„ AI์ž…๋‹ˆ๋‹ค. ๋จผ์ € ๋Œ€์ƒ ๊ฐ์ฒด๋ฅผ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ํŒŒ์ง€ ์˜์—ญ์„ [x1, y1, x2, y2]๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
            "trajectory": "๋‹น์‹ ์€ ๊ฒฝ๋กœ ๊ณ„ํš AI์ž…๋‹ˆ๋‹ค. ๋จผ์ € ํ™˜๊ฒฝ์„ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ๊ฒฝ๋กœ๋ฅผ [(x1,y1), (x2,y2), ...]๋กœ ์ œ์‹œํ•˜์„ธ์š”.",
            "pointing": "๋‹น์‹ ์€ ์ง€์  ์ง€์ • ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ๋จผ์ € ์ฐธ์กฐ์ ๋“ค์„ ํ•œ ์ค„๋กœ ์„ค๋ช…ํ•˜๊ณ , ์œ„์น˜๋ฅผ [(x1,y1), (x2,y2), ...]๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
        }

        system_prompt = system_prompts.get(task_type, system_prompts["general"])

        # Optional chain-of-thought: ask for <thinking> tags before the answer.
        if enable_thinking:
            system_prompt += "\n\n์ถ”๋ก  ๊ณผ์ •์„ <thinking></thinking> ํƒœ๊ทธ ์•ˆ์— ์ž‘์„ฑ ํ›„ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ œ์‹œํ•˜์„ธ์š”. ์žฅ๋ฉด ์ดํ•ด๋Š” ์ถ”๋ก  ๊ณผ์ •๊ณผ ๋ณ„๋„๋กœ ๋ฐ˜๋“œ์‹œ ํฌํ•จํ•˜์„ธ์š”."

        # Optionally prepend web-search results to the system prompt.
        combined_system = system_prompt
        if use_web_search:
            keywords = extract_keywords(prompt, top_k=5)
            if keywords:
                logger.info(f"์›น ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ: {keywords}")
                search_results = do_web_search(keywords)
                combined_system = f"{search_results}\n\n{system_prompt}"

        # Build the multimodal chat messages (system text + user image/text).
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": combined_system}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image},
                    {"type": "text", "text": prompt}
                ]
            }
        ]

        # Tokenize via the chat template and move to the model's device.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(device=model.device, dtype=torch.bfloat16)

        # Left-truncate over-long inputs (keep the most recent tokens).
        if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
            inputs.input_ids = inputs.input_ids[:, -MAX_INPUT_LENGTH:]
            if 'attention_mask' in inputs:
                inputs.attention_mask = inputs.attention_mask[:, -MAX_INPUT_LENGTH:]

        # Sampled generation; no grad tracking needed for inference.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        # Keep only the newly generated tokens (drop the echoed prompt).
        generated_tokens = outputs[0][inputs.input_ids.shape[1]:]

        response = processor.decode(generated_tokens, skip_special_tokens=True).strip()

        # Prompt tokens were already removed; just tidy whitespace.
        response = response.strip()

        # Strip a stray leading "model" role marker if the template leaked it.
        if response.startswith("model\n"):
            response = response[6:].strip()
        elif response.startswith("model"):
            response = response[5:].strip()

        return response

    except Exception as e:
        logger.error(f"์ด๋ฏธ์ง€ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
        import traceback
        return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
    finally:
        # Always release GPU memory, success or failure.
        clear_cuda_cache()
349
+
350
+ ##############################################################################
351
+ # ๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)
352
+ ##############################################################################
353
def _model_gen_with_oom_catch(**kwargs):
    """Run model.generate (for a streamer thread), mapping CUDA OOM to RuntimeError.

    Intended as a Thread target: output is consumed through the streamer
    passed in *kwargs*, not returned. The cache is cleared on every exit
    path so a failed generation does not leave GPU memory pinned.
    """
    global model
    try:
        model.generate(**kwargs)
    except torch.cuda.OutOfMemoryError:
        raise RuntimeError("GPU ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ. Max Tokens๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”.")
    finally:
        clear_cuda_cache()
362
+
363
@spaces.GPU(duration=120)
def analyze_documents_streaming(
    files: List[str],
    prompt: str,
    use_web_search: bool = False,
    max_new_tokens: int = 2048
) -> Iterator[str]:
    """Stream an LLM analysis of uploaded documents.

    Extracts text from each file by extension (.csv/.txt/.pdf; others are
    skipped), optionally prepends web-search results, then generates in a
    background thread and yields the cumulative output text after each new
    chunk. Errors are yielded as a message rather than raised.
    """
    global model, processor

    if not model_loaded:
        if not load_model():
            yield "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
            return

    try:
        # Base system prompt for document analysis.
        system_content = "๋‹น์‹ ์€ ๋ฌธ์„œ๋ฅผ ๋ถ„์„ํ•˜๊ณ  ์š”์•ฝํ•˜๋Š” ์ „๋ฌธ AI์ž…๋‹ˆ๋‹ค."

        # Optionally prepend web-search results.
        if use_web_search:
            keywords = extract_keywords(prompt, top_k=5)
            if keywords:
                search_results = do_web_search(keywords)
                system_content = f"{search_results}\n\n{system_content}"

        # Extract text from each supported file; unknown extensions skipped.
        doc_contents = []
        for file_path in files:
            if file_path.lower().endswith('.csv'):
                content = analyze_csv_file(file_path)
            elif file_path.lower().endswith('.txt'):
                content = analyze_txt_file(file_path)
            elif file_path.lower().endswith('.pdf'):
                content = pdf_to_markdown(file_path)
            else:
                continue
            doc_contents.append(content)

        # Single user turn: all document text followed by the user's request.
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_content}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "\n\n".join(doc_contents) + f"\n\n{prompt}"}
                ]
            }
        ]

        # Tokenize via the chat template and move to the model's device.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(device=model.device, dtype=torch.bfloat16)

        # Streamer yields decoded text as the model generates.
        streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            top_p=0.9,
        )

        # Generate in a background thread so we can consume the streamer here.
        t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
        t.start()

        # Yield the cumulative text so the UI textbox shows growing output.
        output = ""
        for new_text in streamer:
            output += new_text
            yield output

    except Exception as e:
        logger.error(f"๋ฌธ์„œ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
        yield f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
    finally:
        clear_cuda_cache()
450
+
451
+ ##############################################################################
452
+ # Gradio UI (๋กœ๋ด‡ ์‹œ๊ฐํ™” ์ค‘์‹ฌ)
453
+ ##############################################################################
454
# Custom CSS injected into the Gradio Blocks UI (header banner, status
# boxes, info panel, task buttons, webcam frame, auto-capture badge).
css = """
.robot-header {
    text-align: center;
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin-bottom: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.status-box {
    text-align: center;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
    font-weight: bold;
}
.info-box {
    background: #f0f0f0;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
    border-left: 4px solid #2a5298;
}
.task-button {
    min-height: 60px;
    font-size: 1.1em;
}
.webcam-container {
    border: 3px solid #2a5298;
    border-radius: 10px;
    padding: 10px;
    background: #f8f9fa;
}
.auto-capture-status {
    text-align: center;
    padding: 5px;
    border-radius: 5px;
    margin: 5px 0;
    font-weight: bold;
    background: #e8f5e9;
    color: #2e7d32;
}
"""
498
+
499
# Gradio UI: left column = live webcam + capture + task buttons,
# right column = analysis settings and result textbox; a hidden tab holds
# the document-analysis tools, and a gr.Timer drives optional auto-capture.
with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
    gr.HTML("""
    <div class="robot-header">
        <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
        <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐Ÿ” ์›น ๊ฒ€์ƒ‰</h3>
        <p>โšก ์ตœ์‹  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ!</p>
    </div>
    """)

    gr.HTML("""
    <div class="info-box">
        <h4>๐ŸŒŸ ์‹œ์Šคํ…œ ํŠน์ง•:</h4>
        <ul>
            <li>๐Ÿ–ผ๏ธ ๊ณ ๊ธ‰ ์ด๋ฏธ์ง€/๋น„๋””์˜ค ๋ถ„์„ (Gemma3-4B VLM)</li>
            <li>๐Ÿ‘๏ธ ์žฅ๋ฉด ์ดํ•ด ๋ฐ ์ƒํ™ฉ ์„ค๋ช…</li>
            <li>๐Ÿ“‹ ๋‹ค๋‹จ๊ณ„ ์ž‘์—… ๊ณ„ํš ๋ฐ ์ถ”๋ก </li>
            <li>๐Ÿ“ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… (Grounding)</li>
            <li>๐Ÿค ๋กœ๋ด‡ ํŒŒ์ง€์  ๋ถ„์„ (Affordance)</li>
            <li>๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš (Trajectory Planning)</li>
            <li>๐Ÿ” ์‹ค์‹œ๊ฐ„ ์›น ๊ฒ€์ƒ‰ ํ†ตํ•ฉ</li>
            <li>๐Ÿ”„ 10์ดˆ๋งˆ๋‹ค ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„</li>
        </ul>
    </div>
    """)

    with gr.Row():
        # Left column: webcam stream and task controls.
        with gr.Column(scale=1):
            gr.Markdown("### ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")

            with gr.Group(elem_classes="webcam-container"):
                webcam = gr.Image(
                    sources=["webcam"],
                    streaming=True,
                    type="numpy",
                    label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
                    height=350
                )

            # Auto-capture status badge.
            auto_capture_status = gr.HTML(
                '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋Œ€๊ธฐ ์ค‘</div>'
            )

            # Last captured frame (hidden until something is captured).
            captured_image = gr.Image(
                label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
                height=200,
                visible=False
            )

            # Robot task buttons.
            gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ์„ ํƒ")
            with gr.Row():
                capture_btn = gr.Button("๐Ÿ“ธ ์ˆ˜๋™ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
                clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")

            with gr.Row():
                auto_capture_toggle = gr.Checkbox(
                    label="๐Ÿ”„ ์ž๋™ ์บก์ฒ˜ ํ™œ์„ฑํ™” (10์ดˆ๋งˆ๋‹ค)",
                    value=False,
                    info="ํ™œ์„ฑํ™” ์‹œ 10์ดˆ๋งˆ๋‹ค ์ž๋™์œผ๋กœ ์บก์ฒ˜ ๋ฐ ๋ถ„์„"
                )

            with gr.Row():
                planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
                grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")

            with gr.Row():
                affordance_btn = gr.Button("๐Ÿค ํŒŒ์ง€์  ๋ถ„์„", elem_classes="task-button")
                trajectory_btn = gr.Button("๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš", elem_classes="task-button")

        # Right column: analysis settings and results.
        with gr.Column(scale=2):
            gr.Markdown("### โš™๏ธ ๋ถ„์„ ์„ค์ •")

            with gr.Row():
                with gr.Column():
                    task_prompt = gr.Textbox(
                        label="์ž‘์—… ์„ค๋ช… / ์งˆ๋ฌธ",
                        placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
                        value="ํ˜„์žฌ ์žฅ๋ฉด์„ ๋ถ„์„ํ•˜๊ณ  ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ์ œ์•ˆํ•˜์„ธ์š”.",
                        lines=2
                    )

                    with gr.Row():
                        use_web_search = gr.Checkbox(
                            label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
                            value=False,
                            info="๊ด€๋ จ ์ •๋ณด๋ฅผ ์›น์—์„œ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค"
                        )

                        enable_thinking = gr.Checkbox(
                            label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
                            value=False,  # off by default: faster, shorter answers
                            info="Chain-of-Thought ์ถ”๋ก  ๊ณผ์ •์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค"
                        )

                    max_tokens = gr.Slider(
                        label="์ตœ๋Œ€ ํ† ํฐ ์ˆ˜",
                        minimum=100,
                        maximum=4096,
                        value=300,  # 300 leaves room for the scene description
                        step=50
                    )

            gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
            result_output = gr.Textbox(
                label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
                lines=20,
                max_lines=40,
                show_copy_button=True,
                elem_id="result"
            )

            status_display = gr.HTML(
                '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
            )

    # Document-analysis tab (hidden from the UI for now).
    with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„", visible=False):
        with gr.Row():
            with gr.Column():
                doc_files = gr.File(
                    label="๋ฌธ์„œ ์—…๋กœ๋“œ",
                    file_count="multiple",
                    file_types=[".pdf", ".csv", ".txt"],
                    type="filepath"
                )

                doc_prompt = gr.Textbox(
                    label="๋ถ„์„ ์š”์ฒญ",
                    placeholder="์˜ˆ: ์ด ๋ฌธ์„œ๋“ค์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์š”์•ฝํ•˜๊ณ  ๋น„๊ต ๋ถ„์„ํ•˜์„ธ์š”.",
                    lines=3
                )

                doc_web_search = gr.Checkbox(
                    label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
                    value=False
                )

                analyze_docs_btn = gr.Button("๐Ÿ“Š ๋ฌธ์„œ ๋ถ„์„", variant="primary")

            with gr.Column():
                doc_result = gr.Textbox(
                    label="๋ถ„์„ ๊ฒฐ๊ณผ",
                    lines=25,
                    max_lines=50
                )

    # Event handlers and session state.
    webcam_state = gr.State(None)
    auto_capture_state = gr.State({"enabled": False, "timer": None})

    def capture_webcam(frame):
        """Capture the current webcam frame into state; returns (state, image update, status html)."""
        if frame is None:
            return None, None, '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ  ํ”„๋ ˆ์ž„ ์—†์Œ</div>'
        return frame, gr.update(value=frame, visible=True), '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ด๋ฏธ์ง€ ์บก์ฒ˜ ์™„๋ฃŒ</div>'

    def clear_capture():
        """Reset the captured frame and hide the capture preview."""
        return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'

    def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
        """Run analyze_image_for_robot for one task; returns (formatted text, status html)."""
        if image is None:
            return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'

        status = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿš€ {task_type} ๋ถ„์„ ์ค‘...</div>'

        result = analyze_image_for_robot(
            image=image,
            prompt=prompt,
            task_type=task_type,
            use_web_search=use_search,
            enable_thinking=thinking,
            max_new_tokens=tokens
        )

        # Compact result formatting with a timestamp header.
        timestamp = time.strftime("%H:%M:%S")
        task_names = {
            "planning": "์ž‘์—… ๊ณ„ํš",
            "grounding": "๊ฐ์ฒด ์œ„์น˜",
            "affordance": "ํŒŒ์ง€์ ",
            "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
        }

        formatted_result = f"""๐Ÿค– {task_names.get(task_type, '๋ถ„์„')} ๊ฒฐ๊ณผ ({timestamp})
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
{result}
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"""

        complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„ ์™„๋ฃŒ!</div>'
        return formatted_result, complete_status

    # Timer-driven auto capture + planning analysis.
    def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, auto_state):
        """Capture the live frame and analyze it in 'planning' mode; returns 4 UI updates."""
        if webcam_frame is None:
            return (
                None,
                "์ž๋™ ์บก์ฒ˜ ๋Œ€๊ธฐ ์ค‘...",
                '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
                '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>'
            )

        # Timestamp for the result header / status badge.
        timestamp = time.strftime("%H:%M:%S")

        # Analyze the frame using the planning task prompt.
        result = analyze_image_for_robot(
            image=webcam_frame,
            prompt=task_prompt,
            task_type="planning",
            use_web_search=use_search,
            enable_thinking=thinking,
            max_new_tokens=tokens
        )

        formatted_result = f"""๐Ÿ”„ ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ ({timestamp})
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
{result}
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"""

        return (
            webcam_frame,
            formatted_result,
            '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
            f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>'
        )

    # Mirror the webcam stream into session state for later capture.
    webcam.stream(
        fn=lambda x: x,
        inputs=[webcam],
        outputs=[webcam_state]
    )

    # Manual capture button.
    capture_btn.click(
        fn=capture_webcam,
        inputs=[webcam_state],
        outputs=[webcam_state, captured_image, status_display]
    )

    # Reset button.
    clear_capture_btn.click(
        fn=clear_capture,
        outputs=[webcam_state, captured_image, status_display]
    )

    # Task buttons: each runs analyze_with_task with a fixed task_type.
    planning_btn.click(
        fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
        inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
        outputs=[result_output, status_display]
    )

    grounding_btn.click(
        fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
        inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
        outputs=[result_output, status_display]
    )

    affordance_btn.click(
        fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "affordance", s, t, tk),
        inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
        outputs=[result_output, status_display]
    )

    trajectory_btn.click(
        fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "trajectory", s, t, tk),
        inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
        outputs=[result_output, status_display]
    )

    # Document analysis (drains the streaming generator, returns final text).
    def analyze_docs(files, prompt, use_search):
        if not files:
            return "โŒ ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”."

        output = ""
        for chunk in analyze_documents_streaming(files, prompt, use_search):
            output = chunk
        return output

    analyze_docs_btn.click(
        fn=analyze_docs,
        inputs=[doc_files, doc_prompt, doc_web_search],
        outputs=[doc_result]
    )

    # 10-second auto-capture timer (starts inactive).
    timer = gr.Timer(10.0, active=False)

    # Toggle the timer on/off and update the status badge.
    def toggle_auto_capture(enabled):
        if enabled:
            return gr.Timer(10.0, active=True), '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ํ™œ์„ฑํ™”๋จ (10์ดˆ๋งˆ๋‹ค)</div>'
        else:
            return gr.Timer(active=False), '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋น„ํ™œ์„ฑํ™”๋จ</div>'

    auto_capture_toggle.change(
        fn=toggle_auto_capture,
        inputs=[auto_capture_toggle],
        outputs=[timer, auto_capture_status]
    )

    # Each timer tick captures and analyzes the current frame.
    timer.tick(
        fn=auto_capture_and_analyze,
        inputs=[webcam_state, task_prompt, use_web_search, enable_thinking, max_tokens, auto_capture_state],
        outputs=[captured_image, result_output, status_display, auto_capture_status]
    )

    # Warm up the model when the page loads.
    def initial_load():
        load_model()
        return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"

    demo.load(
        fn=initial_load,
        outputs=None
    )
825
+
826
if __name__ == "__main__":
    print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B)...")
    # Listen on all interfaces for containerized (HF Spaces) deployment.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        debug=False
    )