Spaces: Running on Zero

Commit: Upload 8 files

- app.py +181 -31
- enhance_scene_describer.py +75 -43
- image_processor.py +8 -2
- llm_enhancer.py +1218 -0
- requirements.txt +5 -0
- scene_analyzer.py +92 -2
- scene_type.py +0 -7
- style.py +35 -4
app.py
CHANGED
@@ -1,12 +1,13 @@
+import re
 import os
 import numpy as np
 import matplotlib.pyplot as plt
 import gradio as gr
 from typing import Dict, List, Any, Optional, Tuple
 import cv2
 from PIL import Image
 import tempfile
 import uuid
 import spaces
 
 from detection_model import DetectionModel
@@ -15,10 +16,11 @@ from evaluation_metrics import EvaluationMetrics
 from style import Style
 from image_processor import ImageProcessor
 from video_processor import VideoProcessor
+from llm_enhancer import LLMEnhancer
 
-# Initialize Processors
-image_processor = ImageProcessor()
+# Initialize Processors with LLM support
+image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
 video_processor = VideoProcessor(image_processor)
 
 # Helper Function
 def get_all_classes():
@@ -56,10 +58,15 @@ def get_all_classes():
     return sorted(default_classes.items())
 
 @spaces.GPU
-def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None):
+def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True):
     """Processes a single uploaded image."""
-    print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}")
+    print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}")
     try:
+        image_processor.use_llm = use_llm
+        if hasattr(image_processor, 'scene_analyzer'):
+            image_processor.scene_analyzer.use_llm = use_llm
+            print(f"Updated existing scene_analyzer use_llm setting to: {use_llm}")
+
         class_ids_to_filter = None
         if filter_classes:
             class_ids_to_filter = []
@@ -118,8 +125,127 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         scene_desc = scene_analysis.get("description", "Scene analysis requires detected objects.")
         # Ensure scene_desc is a string before adding HTML
         if not isinstance(scene_desc, str):
-
-
+            scene_desc = str(scene_desc)
+
+        def clean_description(desc):
+            if not desc:
+                return ""
+
+            # First, strip question-and-answer formatting
+            if "Questions:" in desc:
+                desc = desc.split("Questions:")[0].strip()
+            if "Answers:" in desc:
+                desc = desc.split("Answers:")[0].strip()
+
+            # Then filter out code and other non-narrative content line by line
+            lines = desc.split('\n')
+            clean_lines = []
+            skip_block = False
+
+            for line in lines:
+                # Detect question-style lines
+                if re.match(r'^\d+\.\s+(What|How|Why|When|Where|Who|The)', line):
+                    continue
+
+                # Check for lines that should be skipped
+                if line.strip().startswith(':param') or line.strip().startswith('"""'):
+                    continue
+                if line.strip().startswith("Exercise") or "class SceneDescriptionSystem" in line:
+                    skip_block = True
+                    continue
+                if ('def generate_scene_description' in line or
+                    'def enhance_scene_descriptions' in line or
+                    'def __init__' in line):
+                    skip_block = True
+                    continue
+                if line.strip().startswith('#TEST'):
+                    skip_block = True
+                    continue
+
+                if skip_block and line.strip() == "":
+                    skip_block = False
+
+                # Keep the line if we are not inside a skipped block
+                if not skip_block:
+                    clean_lines.append(line)
+
+            cleaned_text = '\n'.join(clean_lines)
+
+            # If cleaning removed everything, fall back to the first paragraph of the original description
+            if not cleaned_text.strip():
+                paragraphs = [p.strip() for p in desc.split('\n\n') if p.strip()]
+                if paragraphs:
+                    return paragraphs[0]
+                return desc
+
+            return cleaned_text
+
+        # Fetch and process the scene analysis
+        scene_analysis = stats.get("scene_analysis", {})
+        print("Processing scene_analysis:", scene_analysis.keys())
+
+        # Get the original description
+        scene_desc = scene_analysis.get("description", "Scene analysis requires detected objects.")
+        if not isinstance(scene_desc, str):
+            scene_desc = str(scene_desc)
+
+        print(f"Original scene description (first 50 chars): {scene_desc[:50]}...")
+
+        # Make sure we are working with a valid description
+        clean_scene_desc = clean_description(scene_desc)
+        print(f"Cleaned scene description (first 50 chars): {clean_scene_desc[:50]}...")
+
+        # Even if cleaning yields nothing, make sure the original content is shown
+        if not clean_scene_desc.strip():
+            clean_scene_desc = scene_desc
+
+        # Build the HTML for the original description
+        scene_desc_html = f"<div>{clean_scene_desc}</div>"
+
+        # Get the LLM-enhanced description; default to an empty string rather than None to avoid a NoneType error
+        enhanced_description = scene_analysis.get("enhanced_description", "")
+        if enhanced_description is None:
+            enhanced_description = ""
+
+        if not enhanced_description or not enhanced_description.strip():
+            print("WARNING: LLM enhanced description is empty!")
+
+        # Prepare the badge and description label
+        llm_badge = ""
+        description_to_show = ""
+
+        if use_llm and enhanced_description:
+            llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background: linear-gradient(90deg, #38b2ac, #4299e1); color:white; font-size:0.7rem; font-weight:bold; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); border: 1px solid rgba(255, 255, 255, 0.2);">LLM Enhanced</span>'
+            description_to_show = enhanced_description
+            # The original description is shown in the "Original Scene Analysis" accordion
+        else:
+            llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background-color:#718096; color:white; font-size:0.7rem; font-weight:bold;">Basic</span>'
+            description_to_show = clean_scene_desc
+            # When the LLM is not used, the accordion shows no content
+
+        # When the LLM narrative is used, a badge appears next to the heading
+        scene_description_html = f'''
+        <div>
+            <div class="section-heading" style="font-size:1.2rem; margin-top:15px;">Scene Description {llm_badge}
+                <span style="font-size:0.8rem; color:#666; font-weight:normal; display:block; margin-top:2px;">
+                    {('(Enhanced by AI language model)' if use_llm and enhanced_description else '(Based on object detection)')}
+                </span>
+            </div>
+            <div style="padding:15px; background-color:#ffffff; border-radius:8px; border:1px solid #e2e8f0; margin-bottom:20px; box-shadow:0 1px 3px rgba(0,0,0,0.05);">
+                {description_to_show}
+            </div>
+        </div>
+        '''
+
+        # The original description only appears in the accordion when the LLM is used and an enhanced description exists
+        original_desc_visibility = "block" if use_llm and enhanced_description else "none"
+        original_desc_html = f'''
+        <div id="original_scene_analysis_accordion" style="display: {original_desc_visibility};">
+            <div style="padding:15px; background-color:#f0f0f0; border-radius:8px; border:1px solid #e2e8f0;">
+                {clean_scene_desc}
+            </div>
+        </div>
+        '''
 
         # Prepare activities list
         activities_list = scene_analysis.get("possible_activities", [])
@@ -138,8 +264,15 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         zones = scene_analysis.get("functional_zones", {})
         lighting = scene_analysis.get("lighting_conditions", {"time_of_day": "unknown", "confidence": 0})
 
+        # Log a warning if either description is empty
+        if not clean_scene_desc.strip():
+            print("WARNING: Scene description is empty after cleaning!")
+        if not enhanced_description.strip():
+            print("WARNING: LLM enhanced description is empty!")
+
         return (result_image, result_text, formatted_stats, plot_figure,
-
+                scene_description_html, original_desc_html,
+                activities_list_data, safety_data, zones, lighting)
 
     except Exception as e:
         print(f"Error in handle_image_upload: {e}")
@@ -149,8 +282,8 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         ax.text(0.5, 0.5, "Processing Error", color="red", ha="center", va="center")
         ax.axis('off')
         # Ensure return structure matches outputs even on error
-        return (None, error_msg, {}, fig, f"<div>Error: {str(e)}</div>",
-
+        return (None, error_msg, {}, fig, f"<div>Error: {str(e)}</div>", "Error",
+                [["Error"]], [["Error"]], {}, {"time_of_day": "error", "confidence": 0})
 
 def download_video_from_url(video_url, max_duration_minutes=10):
     """
@@ -273,7 +406,7 @@ def handle_video_upload(video_input, video_url, input_type, model_name, confiden
         return None, error_html, {"error": str(e)}
 
 
 # Create Gradio Interface
 def create_interface():
     """Creates the Gradio interface with Tabs."""
     css = Style.get_css()
@@ -283,7 +416,7 @@ def create_interface():
 
     with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
 
         # Header
         with gr.Group(elem_classes="app-header"):
             gr.HTML("""
                 <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
@@ -303,10 +436,10 @@ def create_interface():
                 </div>
             """)
 
         # Main Content with Tabs
         with gr.Tabs(elem_classes="tabs"):
 
             # Tab 1: Image Processing
             with gr.Tab("Image Processing"):
                 current_image_model = gr.State("yolov8m.pt")  # State for image model selection
                 with gr.Row(equal_height=False):  # Allow columns to have different heights
@@ -331,6 +464,13 @@ def create_interface():
                         label="Confidence Threshold",
                         info="Minimum confidence for displaying a detected object"
                     )
+
+                    use_llm = gr.Checkbox(
+                        label="Use LLM for enhanced scene descriptions",
+                        value=True,
+                        info="Provides more detailed and natural language descriptions (may increase processing time)"
+                    )
+
                     with gr.Accordion("Filter Classes", open=False):
                         gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
                         with gr.Row():
@@ -350,11 +490,12 @@ def create_interface():
                     with gr.Group(elem_classes="how-to-use"):
                         gr.HTML('<div class="section-heading">How to Use (Image)</div>')
                         gr.Markdown("""
-
-
-
-
-
+                        1. Upload an image or use the camera
+                        2. (Optional) Adjust settings like confidence threshold or model size (n=fast, m=balanced, x=accurate)
+                        3. In Analysis Settings, you can uncheck "Use LLM for enhanced scene descriptions" if you prefer faster processing
+                        4. Optionally filter to specific object classes
+                        5. Click **Detect Objects** button
+                        """)
                     # Image Examples
                     gr.Examples(
                         examples=[
@@ -392,8 +533,18 @@ def create_interface():
                             </details>
                         """)
 
-
-
+                        gr.HTML('''
+                            <div style="margin-top: 5px; padding: 6px 10px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #63b3ed; font-size: 12px; margin-bottom: 10px;">
+                                <p style="margin: 0; color: #4a5568;">
+                                    <b>Note:</b> AI descriptions may vary slightly between runs, reflecting the generative nature of the model, much as a person might word the same scene differently each time. Processing can take longer on first use or for complex scenes, since the LLM enhancement requires additional computational resources.
+                                </p>
+                            </div>
+                        ''')
+                        image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
+
+                        # The original description is also shown when the LLM-enhanced narrative is used
+                        with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
+                            image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")
 
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -419,7 +570,7 @@ def create_interface():
                         gr.HTML('<div class="section-heading">Detection Statistics</div>')
                         image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")
 
             # Tab 2: Video Processing
             with gr.Tab("Video Processing"):
                 with gr.Row(equal_height=False):
                     # Left Column: Video Input & Controls
@@ -525,7 +676,7 @@ def create_interface():
                         gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
                         video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display")  # Display statistics
 
         # Event Listeners
         # Image Model Change Handler
         image_model_dropdown.change(
             fn=lambda model: (model, DetectionModel.get_model_description(model)),
@@ -556,13 +707,12 @@ def create_interface():
             outputs=[video_input, video_url_input]
         )
 
-        # Image Processing Button Click
         image_detect_btn.click(
             fn=handle_image_upload,
-            inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter],
+            inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm],
             outputs=[
                 image_result_image, image_result_text, image_stats_json, image_plot_output,
-                image_scene_description_html, image_activities_list, image_safety_list, image_zones_json,
+                image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
                 image_lighting_info
             ]
         )
@@ -584,7 +734,7 @@ def create_interface():
         gr.HTML("""
            <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
                <div style="margin-bottom: 15px;">
-                   <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP and Ultralytics • Created with Gradio</p>
+                   <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
                </div>
                <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
                    <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
enhance_scene_describer.py
CHANGED
@@ -164,8 +164,8 @@ class EnhancedSceneDescriber:
             "elevated_threshold": 0.6,  # Objects mostly in middle/bottom
             "elevated_top_threshold": 0.3  # Few objects at top of frame
         }
 
 
     def generate_description(self,
                              scene_type: str,
                              detected_objects: List[Dict],
@@ -193,7 +193,7 @@ class EnhancedSceneDescriber:
             return self._format_final_description(self._generate_generic_description(detected_objects, lighting_info))
 
         # Detect viewpoint
         viewpoint = self._detect_viewpoint(detected_objects)
 
         # Process aerial viewpoint scene types
         if viewpoint == "aerial":
@@ -326,7 +326,7 @@ class EnhancedSceneDescriber:
             r"with \d+ people",
             r"with \d+ person"
         ]
 
         # Check and remove each pattern
         filtered_description = description
         for pattern in small_people_patterns:
@@ -358,40 +358,72 @@ class EnhancedSceneDescriber:
         # Final formatting to ensure correct punctuation and capitalization
         description = self._format_final_description(description)
 
-
+        description_lines = description.split('\n')
+        clean_description = []
+        skip_block = False  # define this flag before the loop
+
+        for line in description_lines:
+            # Check whether this line should be skipped
+            if line.strip().startswith(':param') or line.strip().startswith('"""'):
+                continue
+            if line.strip().startswith("Exercise") or "class SceneDescriptionSystem" in line:
+                skip_block = True
+                continue
+            if ('def generate_scene_description' in line or
+                'def enhance_scene_descriptions' in line or
+                'def __init__' in line):
+                skip_block = True
+                continue
+            if line.strip().startswith('#TEST'):
+                skip_block = True
+                continue
+
+            # A blank line ends skip mode
+            if skip_block and line.strip() == "":
+                skip_block = False
+
+            # If we are not skipping, add this line to the result
+            if not skip_block:
+                clean_description.append(line)
+
+        # If the filtered description is empty, return the original description
+        if not clean_description:
+            return description
+        else:
+            return '\n'.join(clean_description)
 
     def _smart_append(self, current_text: str, new_fragment: str) -> str:
         """
         Intelligently append a new text fragment to the current text,
         handling punctuation and capitalization correctly.
 
         Args:
             current_text: The existing text to append to
             new_fragment: The new text fragment to append
 
         Returns:
             str: The combined text with proper formatting
         """
         # Handle empty cases
         if not new_fragment:
             return current_text
 
         if not current_text:
             # Ensure first character is uppercase for the first fragment
             return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""
 
         # Clean up existing text
         current_text = current_text.rstrip()
 
         # Check for ending punctuation
         ends_with_sentence = current_text.endswith(('.', '!', '?'))
         ends_with_comma = current_text.endswith(',')
 
         # Specifically handle the "A xxx A yyy" pattern that's causing issues
         if (current_text.startswith("A ") or current_text.startswith("An ")) and \
            (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
             return current_text + ". " + new_fragment
 
         # Decide how to join the texts
         if ends_with_sentence:
             # After a sentence, start with uppercase and add proper spacing
@@ -406,7 +438,7 @@ class EnhancedSceneDescriber:
             # When adding a new sentence about the scene, use a period
             joined_text = current_text + ". " + new_fragment
         else:
             # For other cases, decide based on the content
             if self._is_related_phrases(current_text, new_fragment):
                 if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
                     joined_text = current_text + ", " + new_fragment
@@ -415,18 +447,18 @@ class EnhancedSceneDescriber:
         else:
             # Use period for unrelated phrases
             joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])
 
         return joined_text
 
     def _is_related_phrases(self, text1: str, text2: str) -> bool:
         """
         Determine if two phrases are related and should be connected with a comma
         rather than separated with a period.
 
         Args:
             text1: The first text fragment
             text2: The second text fragment to be appended
 
         Returns:
             bool: Whether the phrases appear to be related
         """
@@ -434,61 +466,61 @@ class EnhancedSceneDescriber:
         if (text1.startswith("A ") or text1.startswith("An ")) and \
            (text2.startswith("A ") or text2.startswith("An ")):
             return False  # These are separate descriptions, not related phrases
 
         # Check if the second phrase starts with a connecting word
         connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
                             "this", "these", "that", "those", "and", "or", "but"]
 
         first_word = text2.split()[0].lower() if text2 else ""
         if first_word in connecting_words:
             return True
 
         # Check if the first phrase ends with something that suggests continuity
         ending_patterns = ["such as", "including", "like", "especially", "particularly",
                            "for example", "for instance", "namely", "specifically"]
 
         for pattern in ending_patterns:
             if text1.lower().endswith(pattern):
                 return True
 
         # Check if both phrases are about the scene
         if "scene" in text1.lower() and "scene" in text2.lower():
             return False  # Separate statements about the scene should be separate sentences
 
         return False
 
     def _format_final_description(self, text: str) -> str:
         """
         Format the final description text to ensure correct punctuation,
         capitalization, and spacing.
 
         Args:
             text: The text to format
 
         Returns:
             str: The properly formatted text
         """
         import re
 
         if not text:
             return ""
 
         # 1. Handle consecutive fragments that each start with "A"/"An" (a common problem)
         text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text, flags=re.IGNORECASE)
         text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
 
         # 2. Make sure the first letter is capitalized
         text = text[0].upper() + text[1:] if text else ""
 
         # 3. Fix spacing problems between words
         text = re.sub(r'\s{2,}', ' ', text)  # collapse multiple spaces into one
         text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # add a space between a lowercase and an uppercase letter
 
         # 4. Fix run-together words
         text = re.sub(r'([a-zA-Z])and', r'\1 and', text)  # add a space before "and"
         text = re.sub(r'([a-zA-Z])with', r'\1 with', text)  # add a space before "with"
         text = re.sub(r'plants(and|with|or)', r'plants \1', text)  # fix cases like "plantsand"
 
         # 5. Fix capitalization after punctuation
         text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text)  # capitalize after a period
 
@@ -498,46 +530,46 @@ class EnhancedSceneDescriber:
             # Exceptions: keep proper nouns, personal pronouns, etc. capitalized
             if word in ["I", "I'm", "I've", "I'd", "I'll"]:
                 return match.group(0)  # keep as-is
 
             # Keep months, weekdays, and other proper nouns capitalized
             proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
                             "August", "September", "October", "November", "December",
                             "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
             if word in proper_nouns:
                 return match.group(0)  # keep as-is
 
             # Otherwise, lowercase the first letter
             return match.group(1) + word[0].lower() + word[1:]
 
         # Match a comma followed by whitespace and a capitalized word
         text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)
 
         common_phrases = [
             (r'Social or seating area', r'social or seating area'),
             (r'Sleeping area', r'sleeping area'),
             (r'Dining area', r'dining area'),
             (r'Living space', r'living space')
         ]
 
         for phrase, replacement in common_phrases:
             # Only adjust these terms mid-sentence; keep capitals at sentence starts
             text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
             # Also adjust the terms after a comma, keeping sentence-initial capitals
             text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
 
         # 7. Make sure punctuation is spaced correctly
         text = re.sub(r'\s+([.,;:!?])', r'\1', text)  # no space before punctuation
         text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text)  # a space after punctuation
 
         # 8. Fix repeated punctuation
         text = re.sub(r'\.{2,}', '.', text)  # collapse multiple periods
         text = re.sub(r',{2,}', ',', text)  # collapse multiple commas
 
         # 9. Make sure the text ends with punctuation
         if text and not text[-1] in '.!?':
             text += '.'
 
         return text
 
     def _is_intersection(self, detected_objects: List[Dict]) -> bool:
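
The _smart_append rules above are easiest to see on two small fragments; the special case for consecutive article-led fragments forces a sentence break. A standalone re-implementation of just that branch, for illustration:

def smart_append_demo(current_text: str, new_fragment: str) -> str:
    # Mirrors the "A xxx A yyy" special case in _smart_append: two fragments
    # that each start with an article become separate sentences.
    if (current_text.startswith(("A ", "An "))
            and new_fragment.startswith(("A ", "An "))):
        return current_text + ". " + new_fragment
    return current_text + " " + new_fragment

print(smart_append_demo("A living room with a sofa", "A table sits nearby"))
# -> A living room with a sofa. A table sits nearby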
image_processor.py
CHANGED
@@ -20,11 +20,13 @@ class ImageProcessor:
     Separates processing logic from UI components
     """
 
-    def __init__(self):
+    def __init__(self, use_llm=True, llm_model_path=None):
         """Initialize the image processor with required components"""
         self.color_mapper = ColorMapper()
         self.model_instances = {}
         self.lighting_analyzer = LightingAnalyzer()
+        self.use_llm = use_llm
+        self.llm_model_path = llm_model_path
 
     def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
         """
@@ -65,7 +67,11 @@ class ImageProcessor:
         try:
             # Initialize scene analyzer if not already done
             if not hasattr(self, 'scene_analyzer'):
-                self.scene_analyzer = SceneAnalyzer(
+                self.scene_analyzer = SceneAnalyzer(
+                    class_names=detection_result.names,
+                    use_llm=self.use_llm,
+                    llm_model_path=self.llm_model_path
+                )
 
             # Make sure the class names are updated correctly
             if self.scene_analyzer.class_names is None:
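
A hypothetical usage sketch of the new constructor flags; note that the LLM settings are only consumed later, when the first detection result triggers the lazy SceneAnalyzer construction shown above:

from image_processor import ImageProcessor  # repository module changed above

# The constructor only stores the flags; no LLM weights are loaded here.
processor = ImageProcessor(use_llm=False, llm_model_path=None)

# Because SceneAnalyzer is created lazily inside the processing path,
# toggling use_llm before the first image is processed is safe and cheap.
processor.use_llm = True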
llm_enhancer.py
ADDED
@@ -0,0 +1,1218 @@
1 |
+
import re
|
2 |
+
import os
|
3 |
+
import torch
|
4 |
+
from typing import Dict, List, Tuple, Any, Optional
|
5 |
+
import logging
|
6 |
+
|
7 |
+
class LLMEnhancer:
|
8 |
+
"""
|
9 |
+
負責使用LLM (Large Language Model) 增強場景理解和描述。
|
10 |
+
未來可以再整合Llama或其他LLM模型進行場景描述的生成和豐富化。
|
11 |
+
"""
|
12 |
+
|
13 |
+
def __init__(self,
|
14 |
+
model_path: Optional[str] = None,
|
15 |
+
tokenizer_path: Optional[str] = None,
|
16 |
+
device: Optional[str] = None,
|
17 |
+
max_length: int = 2048,
|
18 |
+
temperature: float = 0.3,
|
19 |
+
top_p: float = 0.85):
|
20 |
+
"""
|
21 |
+
初始化LLM增強器
|
22 |
+
|
23 |
+
Args:
|
24 |
+
model_path: LLM模型的路徑或HuggingFace log in,默認使用Llama 3.2
|
25 |
+
tokenizer_path: token處理器的路徑,通常與model_path相同
|
26 |
+
device: 設備檢查 ('cpu'或'cuda')
|
27 |
+
max_length: 生成文本的最大長度
|
28 |
+
temperature: 生成文本的溫度(較高比較有創意,較低會偏保守)
|
29 |
+
top_p: 生成文本時的核心採樣機率閾值
|
30 |
+
"""
|
31 |
+
self.logger = logging.getLogger("LLMEnhancer")
|
32 |
+
self.logger.setLevel(logging.INFO)
|
33 |
+
handler = logging.StreamHandler()
|
34 |
+
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
35 |
+
self.logger.addHandler(handler)
|
36 |
+
|
37 |
+
# 設置默認模型路徑就是用Llama3.2
|
38 |
+
self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
|
39 |
+
self.tokenizer_path = tokenizer_path or self.model_path
|
40 |
+
|
41 |
+
# 確定運行設備
|
42 |
+
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
|
43 |
+
self.logger.info(f"Using device: {self.device}")
|
44 |
+
|
45 |
+
# create parameters
|
46 |
+
self.max_length = max_length
|
47 |
+
self.temperature = temperature
|
48 |
+
self.top_p = top_p
|
49 |
+
|
50 |
+
self.model = None
|
51 |
+
self.tokenizer = None
|
52 |
+
|
53 |
+
# 計數器,用來追蹤模型調用次數
|
54 |
+
self.call_count = 0
|
55 |
+
|
56 |
+
self._initialize_prompts()
|
57 |
+
|
58 |
+
# 只在需要時加載模型
|
59 |
+
self._model_loaded = False
|
60 |
+
|
61 |
+
try:
|
62 |
+
self.hf_token = os.environ.get("HF_TOKEN")
|
63 |
+
if self.hf_token:
|
64 |
+
self.logger.info("Logging in to Hugging Face with token")
|
65 |
+
from huggingface_hub import login
|
66 |
+
login(token=self.hf_token)
|
67 |
+
else:
|
68 |
+
self.logger.warning("HF_TOKEN not found in environment variables. Access to gated models may be limited.")
|
69 |
+
except Exception as e:
|
70 |
+
self.logger.error(f"Error during Hugging Face login: {e}")
|
71 |
+
|
72 |
+
def _load_model(self):
|
73 |
+
"""懶加載模型 - 僅在首次需要時加載,使用 8 位量化以節省記憶體"""
|
74 |
+
if self._model_loaded:
|
75 |
+
return
|
76 |
+
|
77 |
+
try:
|
78 |
+
self.logger.info(f"Loading LLM model from {self.model_path} with 8-bit quantization")
|
79 |
+
import torch
|
80 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
81 |
+
torch.cuda.empty_cache()
|
82 |
+
|
83 |
+
# 打印可用 GPU 記憶體
|
84 |
+
if torch.cuda.is_available():
|
85 |
+
free_in_GB = torch.cuda.get_device_properties(0).total_memory / 1024**3
|
86 |
+
print(f"Total GPU memory: {free_in_GB:.2f} GB")
|
87 |
+
|
88 |
+
# 設置 8 位元量化配置
|
89 |
+
quantization_config = BitsAndBytesConfig(
|
90 |
+
load_in_8bit=True,
|
91 |
+
llm_int8_enable_fp32_cpu_offload=True
|
92 |
+
)
|
93 |
+
|
94 |
+
# 加載詞元處理器
|
95 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
96 |
+
self.tokenizer_path,
|
97 |
+
padding_side="left",
|
98 |
+
use_fast=False,
|
99 |
+
token=self.hf_token
|
100 |
+
)
|
101 |
+
|
102 |
+
# 設置特殊標記
|
103 |
+
self.tokenizer.pad_token = self.tokenizer.eos_token
|
104 |
+
|
105 |
+
# 加載 8 位量化模型
|
106 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
107 |
+
self.model_path,
|
108 |
+
quantization_config=quantization_config,
|
109 |
+
device_map="auto",
|
110 |
+
low_cpu_mem_usage=True,
|
111 |
+
token=self.hf_token
|
112 |
+
)
|
113 |
+
|
114 |
+
self.logger.info("Model loaded successfully with 8-bit quantization")
|
115 |
+
self._model_loaded = True
|
116 |
+
|
117 |
+
except Exception as e:
|
118 |
+
self.logger.error(f"Error loading LLM model: {e}")
|
119 |
+
import traceback
|
120 |
+
traceback.print_exc()
|
121 |
+
raise
|
122 |
+
|
123 |
+
def _initialize_prompts(self):
|
124 |
+
"""Return an optimized prompt template specifically for Zephyr model"""
|
125 |
+
# the prompt for the model
|
126 |
+
self.enhance_description_template = """
|
127 |
+
<|system|>
|
128 |
+
You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
|
129 |
+
|
130 |
+
Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
|
131 |
+
|
132 |
+
</|system|>
|
133 |
+
|
134 |
+
<|user|>
|
135 |
+
Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
|
136 |
+
|
137 |
+
ORIGINAL:
|
138 |
+
{original_description}
|
139 |
+
|
140 |
+
CRITICAL RULES:
|
141 |
+
1. NEVER assume room type, object function, or scene purpose unless directly stated.
|
142 |
+
2. NEVER invent object types. You are limited to: {object_list}
|
143 |
+
3. NEVER speculate on object quantity. If the description says "10 people" , DO NOT say "dozens" or "many". Maintain the original quantity unless specified.
|
144 |
+
4. Use terms like "in the scene", "visible in the background", or "positioned in the lower left" instead of assuming direction or layout logic.
|
145 |
+
5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
|
146 |
+
6. Write 2–4 complete, well-structured sentences with punctuation.
|
147 |
+
7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
|
148 |
+
8. NEVER include explanations, reasoning, or tags. ONLY provide the enhanced description.
|
149 |
+
9. Do not repeat any sentence structure or phrase more than once.
|
150 |
+
</|user|>
|
151 |
+
|
152 |
+
<|assistant|>
|
153 |
+
"""
|
154 |
+
|
155 |
+
|
156 |
+
# 錯誤檢測提示
|
157 |
+
self.verify_detection_template = """
|
158 |
+
Task: You are an advanced vision system that verifies computer vision detections for accuracy.
|
159 |
+
|
160 |
+
Analyze the following detection results and identify any potential errors or inconsistencies:
|
161 |
+
|
162 |
+
SCENE TYPE: {scene_type}
|
163 |
+
SCENE NAME: {scene_name}
|
164 |
+
CONFIDENCE: {confidence:.2f}
|
165 |
+
|
166 |
+
DETECTED OBJECTS: {detected_objects}
|
167 |
+
|
168 |
+
CLIP ANALYSIS RESULTS:
|
169 |
+
{clip_analysis}
|
170 |
+
|
171 |
+
Possible Errors to Check:
|
172 |
+
1. Objects misidentified (e.g., architectural elements labeled as vehicles)
|
173 |
+
2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
|
174 |
+
3. Objects that seem out of place for this type of scene
|
175 |
+
4. Inconsistencies between different detection systems
|
176 |
+
|
177 |
+
If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
|
178 |
+
|
179 |
+
Verification Results:
|
180 |
+
"""
|
181 |
+
|
182 |
+
# 無檢測處理提示
|
183 |
+
self.no_detection_template = """
|
184 |
+
Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
|
185 |
+
|
186 |
+
Based on advanced image embeddings (CLIP analysis), we have the following information:
|
187 |
+
|
188 |
+
MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
|
189 |
+
VIEWPOINT: {viewpoint}
|
190 |
+
LIGHTING: {lighting_condition}
|
191 |
+
|
192 |
+
CULTURAL ANALYSIS: {cultural_analysis}
|
193 |
+
|
194 |
+
Create a detailed description of what might be in this scene, considering:
|
195 |
+
1. The most likely type of location or setting
|
196 |
+
2. Possible architectural or natural elements present
|
197 |
+
3. The lighting and atmosphere
|
198 |
+
4. Potential cultural or regional characteristics
|
199 |
+
|
200 |
+
Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
|
201 |
+
|
202 |
+
Scene Description:
|
203 |
+
"""
|
204 |
+
|
205 |
+
def _clean_llama_response(self, response: str) -> str:
|
206 |
+
"""處理 Llama 模型特有的輸出格式問題"""
|
207 |
+
# 首先應用通用清理
|
208 |
+
response = self._clean_model_response(response)
|
209 |
+
|
210 |
+
# 移除 Llama 常見的前綴短語
|
211 |
+
prefixes_to_remove = [
|
212 |
+
"Here's the enhanced description:",
|
213 |
+
"Enhanced description:",
|
214 |
+
"Here is the enhanced scene description:",
|
215 |
+
"I've enhanced the description while preserving all factual details:"
|
216 |
+
]
|
217 |
+
|
218 |
+
for prefix in prefixes_to_remove:
|
219 |
+
if response.lower().startswith(prefix.lower()):
|
220 |
+
response = response[len(prefix):].strip()
|
221 |
+
|
222 |
+
# 移除可能的後綴說明
|
223 |
+
suffixes_to_remove = [
|
224 |
+
"I've maintained all the key factual elements",
|
225 |
+
"I've preserved all the factual details",
|
226 |
+
"All factual elements have been maintained"
|
227 |
+
]
|
228 |
+
|
229 |
+
for suffix in suffixes_to_remove:
|
230 |
+
if response.lower().endswith(suffix.lower()):
|
231 |
+
response = response[:response.rfind(suffix)].strip()
|
232 |
+
|
233 |
+
return response
|
234 |
+
|
235 |
+
def _detect_scene_type(self, detected_objects: List[Dict]) -> str:
|
236 |
+
"""
|
237 |
+
Detect scene type based on object distribution and patterns
|
238 |
+
"""
|
239 |
+
# Default scene type
|
240 |
+
scene_type = "intersection"
|
241 |
+
|
242 |
+
# Count objects by class
|
243 |
+
object_counts = {}
|
244 |
+
for obj in detected_objects:
|
245 |
+
class_name = obj.get("class_name", "")
|
246 |
+
if class_name not in object_counts:
|
247 |
+
object_counts[class_name] = 0
|
248 |
+
object_counts[class_name] += 1
|
249 |
+
|
250 |
+
# 辨識人
|
251 |
+
people_count = object_counts.get("person", 0)
|
252 |
+
|
253 |
+
# 交通工具的
|
254 |
+
car_count = object_counts.get("car", 0)
|
255 |
+
bus_count = object_counts.get("bus", 0)
|
256 |
+
truck_count = object_counts.get("truck", 0)
|
257 |
+
total_vehicles = car_count + bus_count + truck_count
|
258 |
+
|
259 |
+
# Simple scene type detection logic
|
260 |
+
if people_count > 8 and total_vehicles < 2:
|
261 |
+
scene_type = "pedestrian_crossing"
|
262 |
+
elif people_count > 5 and total_vehicles > 2:
|
263 |
+
scene_type = "busy_intersection"
|
264 |
+
elif people_count < 3 and total_vehicles > 3:
|
265 |
+
scene_type = "traffic_junction"
|
266 |
+
|
267 |
+
return scene_type
|
268 |
+
|
269 |
+
+    def _clean_scene_type(self, scene_type: str) -> str:
+        """Normalize the scene type string so it reads well in prompts."""
+        if not scene_type:
+            return "scene"
+
+        # Replace underscores with spaces and capitalize each word
+        if '_' in scene_type:
+            return ' '.join(word.capitalize() for word in scene_type.split('_'))
+
+        return scene_type
+
+    def _clean_model_response(self, response: str) -> str:
+        """Clean the model response by removing common markers and prefixes."""
+        # Remove any leftover system-style markers
+        response = re.sub(r'<\|.*?\|>', '', response)
+
+        # Remove prefixes such as "This european_plaza" or similar
+        response = re.sub(r'^This [a-z_]+\s+', '', response)
+
+        # Ensure the response starts with a capital letter
+        if response and not response[0].isupper():
+            response = response[0].upper() + response[1:]
+
+        return response.strip()
+
+    def _validate_scene_facts(self, enhanced_desc: str, original_desc: str, people_count: int) -> str:
+        """Validate key facts in the enhanced description."""
+        # Check if the people count is preserved
+        if people_count > 0:
+            people_pattern = re.compile(r'(\d+)\s+(?:people|persons|pedestrians|individuals)', re.IGNORECASE)
+            people_match = people_pattern.search(enhanced_desc)
+
+            if not people_match or int(people_match.group(1)) != people_count:
+                # Replace the incorrect count, or add it if missing
+                if people_match:
+                    enhanced_desc = people_pattern.sub(f"{people_count} people", enhanced_desc)
+                else:
+                    enhanced_desc = f"The scene shows {people_count} people. " + enhanced_desc
+
+        # Ensure the aerial perspective is mentioned
+        if "aerial" in original_desc.lower() and "aerial" not in enhanced_desc.lower():
+            enhanced_desc = "From an aerial perspective, " + enhanced_desc[0].lower() + enhanced_desc[1:]
+
+        return enhanced_desc
+
+    def reset_context(self):
+        """Reset the model context before processing a new image."""
+        if self._model_loaded:
+            # Clear the GPU cache
+            torch.cuda.empty_cache()
+            self.logger.info("Model context reset")
+        else:
+            self.logger.info("Model not loaded, no context to reset")
+
+    def _remove_introduction_sentences(self, response: str) -> str:
+        """Remove introductory sentences that may precede the generated text."""
+        # Common introductory patterns
+        intro_patterns = [
+            r'^Here is the (?:rewritten|enhanced) .*?description:',
+            r'^The (?:rewritten|enhanced) description:',
+            r'^Here\'s the (?:rewritten|enhanced) description of .*?:'
+        ]
+
+        for pattern in intro_patterns:
+            if re.match(pattern, response, re.IGNORECASE):
+                # Keep only the content after the colon
+                parts = re.split(r':', response, 1)
+                if len(parts) > 1:
+                    return parts[1].strip()
+
+        return response
+
+    def enhance_description(self, scene_data: Dict[str, Any]) -> str:
+        """Improved scene description enhancer that handles all scene types while preserving perspective and lighting information; serves as the main entry point for other classes."""
+        try:
+            # Reset the context
+            self.reset_context()
+
+            # Ensure the model is loaded
+            if not self._model_loaded:
+                self._load_model()
+
+            # Extract the original description
+            original_desc = scene_data.get("original_description", "")
+            if not original_desc:
+                return "No original description provided."
+
+            # Get and normalize the scene type
+            scene_type = scene_data.get("scene_type", "unknown scene")
+            scene_type = self._clean_scene_type(scene_type)
+
+            # Extract the detected objects and filter out low-confidence ones
+            detected_objects = scene_data.get("detected_objects", [])
+            filtered_objects = []
+
+            # High confidence threshold for strict object filtering
+            high_confidence_threshold = 0.65
+
+            for obj in detected_objects:
+                confidence = obj.get("confidence", 0)
+                class_name = obj.get("class_name", "")
+
+                # Use an even higher threshold for special classes
+                special_classes = ["airplane", "helicopter", "boat"]
+                if class_name in special_classes:
+                    if confidence < 0.75:  # higher threshold for these classes
+                        continue
+
+                # Keep only high-confidence objects
+                if confidence >= high_confidence_threshold:
+                    filtered_objects.append(obj)
+
+            # Build the object list and counts, using only the filtered high-confidence objects
+            object_counts = {}
+            for obj in filtered_objects:
+                class_name = obj.get("class_name", "")
+                if class_name not in object_counts:
+                    object_counts[class_name] = 0
+                object_counts[class_name] += 1
+
+            # Format the high-confidence objects as a list
+            high_confidence_objects = ", ".join([f"{count} {obj}" for obj, count in object_counts.items()])
+
+            # If there are no high-confidence objects, fall back to keywords from the original description
+            if not high_confidence_objects:
+                # Extract object mentions from the original description
+                object_keywords = self._extract_objects_from_description(original_desc)
+                high_confidence_objects = ", ".join(object_keywords) if object_keywords else "objects visible in the scene"
+
+            # Preserve key perspective information from the original description
+            perspective = self._extract_perspective_from_description(original_desc)
+
+            # Extract lighting information
+            lighting_description = "unknown lighting"
+            if "lighting_info" in scene_data:
+                lighting_info = scene_data.get("lighting_info", {})
+                time_of_day = lighting_info.get("time_of_day", "unknown")
+                is_indoor = lighting_info.get("is_indoor", False)
+                lighting_description = f"{'indoor' if is_indoor else 'outdoor'} {time_of_day} lighting"
+
+            # Build the prompt, combining all key information
+            prompt = self.enhance_description_template.format(
+                scene_type=scene_type,
+                object_list=high_confidence_objects,
+                original_description=original_desc,
+                perspective=perspective,
+                lighting_description=lighting_description
+            )
+
+            # Generate the enhanced description
+            self.logger.info("Generating LLM response...")
+            response = self._generate_llm_response(prompt)
+
+            # Stricter criteria for checking response completeness
+            is_incomplete = (
+                len(response) < 100 or  # too short
+                (len(response) < 200 and "." not in response[-30:]) or  # no proper punctuation near the end
+                any(response.endswith(phrase) for phrase in ["in the", "with the", "and the"])  # ends on an incomplete phrase
+            )
+
+            max_retries = 3
+            attempts = 0
+            while attempts < max_retries and is_incomplete:
+                self.logger.warning(f"Generated incomplete response, retrying... Attempt {attempts+1}/{max_retries}")
+                # Regenerate
+                response = self._generate_llm_response(prompt)
+                attempts += 1
+
+                # Re-check completeness
+                is_incomplete = (len(response) < 100 or
+                                 (len(response) < 200 and "." not in response[-30:]) or
+                                 any(response.endswith(phrase) for phrase in ["in the", "with the", "and the"]))
+
+            # Ensure the response is not empty
+            if not response or len(response.strip()) < 10:
+                self.logger.warning("Generated response was empty or too short, returning original description")
+                return original_desc
+
+            # Clean the response using the method that matches the model
+            if "llama" in self.model_path.lower():
+                result = self._clean_llama_response(response)
+            else:
+                result = self._clean_model_response(response)
+
+            # Remove introductory sentences
+            result = self._remove_introduction_sentences(result)
+
+            # Remove explanatory notes
+            result = self._remove_explanatory_notes(result)
+
+            # Run the factual accuracy check
+            result = self._verify_factual_accuracy(original_desc, result, high_confidence_objects)
+
+            # Ensure scene type and perspective consistency
+            result = self._ensure_scene_type_consistency(result, scene_type, original_desc)
+            if perspective and perspective.lower() not in result.lower():
+                result = f"{perspective}, {result[0].lower()}{result[1:]}"
+
+            return str(result)
+
+        except Exception as e:
+            self.logger.error(f"Enhancement failed: {str(e)}")
+            import traceback
+            self.logger.error(traceback.format_exc())
+            # Fall back to the original description on any error
+            # (read it from scene_data so the fallback works even if the
+            # failure happened before original_desc was assigned)
+            return scene_data.get("original_description", "")
+
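For orientation, a hedged usage sketch of this entry point; the scene_data keys mirror the ones read above, but the values are invented for illustration:

# Hypothetical call; LLMEnhancer is defined at the top of llm_enhancer.py
# and lazily loads its model on first use.
enhancer = LLMEnhancer()
scene_data = {
    "original_description": "From an aerial perspective, the scene shows 12 people crossing a busy intersection.",
    "scene_type": "busy_intersection",
    "detected_objects": [
        {"class_name": "person", "confidence": 0.82},
        {"class_name": "car", "confidence": 0.71},
        {"class_name": "boat", "confidence": 0.68},  # dropped: special class below the 0.75 threshold
    ],
    "lighting_info": {"time_of_day": "day", "is_indoor": False},
}
print(enhancer.enhance_description(scene_data))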
+    def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
+        """Verify that the generated description contains no information absent from the original description or object list."""
+
+        # Combine the original description and object list as the authorized vocabulary source
+        authorized_content = original.lower() + " " + object_list.lower()
+
+        # Check substantive nouns in the generated description
+        # Lists of common location, cultural, and regional terms
+        location_terms = ["plaza", "square", "market", "mall", "avenue", "boulevard"]
+        cultural_terms = ["european", "asian", "american", "african", "western", "eastern"]
+
+        # Check each term in the generated text
+        for term in location_terms + cultural_terms:
+            # Replace only when the term appears in the generated text but not in the authorized content
+            if term in generated.lower() and term not in authorized_content:
+                # Pick an appropriate replacement based on the term type
+                if term in location_terms:
+                    replacement = "area"
+                else:
+                    replacement = "scene"
+
+                # Replace using a whole-word regex match
+                pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
+                generated = pattern.sub(replacement, generated)
+
+        return generated
+
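The whole-word regex is what keeps this substitution safe: a plain substring replace would also rewrite longer words that merely contain the term. A quick demonstration with invented strings:

import re

original = "An aerial view of an intersection with 12 people."
generated = "A lively European plaza with 12 people."
authorized = original.lower()

# Mirror the term -> replacement mapping above (cultural -> "scene", location -> "area")
for term, repl in [("european", "scene"), ("plaza", "area")]:
    if term in generated.lower() and term not in authorized:
        generated = re.compile(r'\b' + term + r'\b', re.IGNORECASE).sub(repl, generated)

print(generated)  # "A lively scene area with 12 people."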
+    def verify_detection(self,
+                         detected_objects: List[Dict],
+                         clip_analysis: Dict[str, Any],
+                         scene_type: str,
+                         scene_name: str,
+                         confidence: float) -> Dict[str, Any]:
+        """
+        Verify and possibly correct YOLO detection results.
+
+        Args:
+            detected_objects: List of objects detected by YOLO
+            clip_analysis: CLIP analysis results
+            scene_type: Identified scene type
+            scene_name: Scene name
+            confidence: Confidence of the scene classification
+
+        Returns:
+            Dict: Dictionary containing the verification result and suggestions
+        """
+        # Ensure the model is loaded
+        self._load_model()
+
+        # Format the data
+        objects_str = self._format_objects_for_prompt(detected_objects)
+        clip_str = self._format_clip_results(clip_analysis)
+
+        # Build the prompt
+        prompt = self.verify_detection_template.format(
+            scene_type=scene_type,
+            scene_name=scene_name,
+            confidence=confidence,
+            detected_objects=objects_str,
+            clip_analysis=clip_str
+        )
+
+        # Call the LLM for verification
+        verification_result = self._generate_llm_response(prompt)
+
+        # Parse the verification result
+        result = {
+            "verification_text": verification_result,
+            "has_errors": "appear accurate" not in verification_result.lower(),
+            "corrected_objects": None  # detailed error correction may be implemented in a future version
+        }
+
+        return result
+
+    def _validate_content_consistency(self, original_desc: str, enhanced_desc: str) -> str:
+        """Validate that the enhanced description is consistent with the original description."""
+        # Extract key numbers from the original description
+        people_count_match = re.search(r'(\d+)\s+people', original_desc, re.IGNORECASE)
+        people_count = int(people_count_match.group(1)) if people_count_match else None
+
+        # Validate people-count consistency
+        if people_count:
+            enhanced_count_match = re.search(r'(\d+)\s+people', enhanced_desc, re.IGNORECASE)
+            if not enhanced_count_match or int(enhanced_count_match.group(1)) != people_count:
+                # Keep the original count
+                if enhanced_count_match:
+                    enhanced_desc = re.sub(r'\b\d+\s+people\b', f"{people_count} people", enhanced_desc, flags=re.IGNORECASE)
+                elif "people" in enhanced_desc.lower():
+                    enhanced_desc = re.sub(r'\bpeople\b', f"{people_count} people", enhanced_desc, flags=re.IGNORECASE)
+
+        # Validate viewpoint/perspective consistency
+        perspective_terms = ["aerial", "bird's-eye", "overhead", "ground level", "eye level"]
+
+        for term in perspective_terms:
+            if term in original_desc.lower() and term not in enhanced_desc.lower():
+                # Add the missing perspective information
+                if enhanced_desc[0].isupper():
+                    enhanced_desc = f"From {term} view, {enhanced_desc[0].lower()}{enhanced_desc[1:]}"
+                else:
+                    enhanced_desc = f"From {term} view, {enhanced_desc}"
+                break
+
+        return enhanced_desc
+
+    def _remove_explanatory_notes(self, response: str) -> str:
+        """Remove explanatory notes, remarks, and other non-descriptive content."""
+
+        # Common note and explanation patterns
+        note_patterns = [
+            r'(?:^|\n)Note:.*?(?:\n|$)',
+            r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
+            r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
+            r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
+        ]
+
+        # Find the first complete descriptive paragraph
+        paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
+
+        # If there is only one paragraph, check and clean it
+        if len(paragraphs) == 1:
+            for pattern in note_patterns:
+                paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
+            return paragraphs[0].strip()
+
+        # If there are multiple paragraphs, identify and drop the note paragraphs
+        content_paragraphs = []
+        for paragraph in paragraphs:
+            is_note = False
+            for pattern in note_patterns:
+                if re.search(pattern, paragraph, flags=re.IGNORECASE):
+                    is_note = True
+                    break
+
+            # Check if the paragraph starts with a common note keyword
+            if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
+                is_note = True
+
+            if not is_note:
+                content_paragraphs.append(paragraph)
+
+        # Return the cleaned content
+        return '\n\n'.join(content_paragraphs).strip()
+
+    def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
+        """
+        Handle the case where YOLO detects no objects.
+
+        Args:
+            clip_analysis: CLIP analysis results
+
+        Returns:
+            str: Generated scene description
+        """
+        # Ensure the model is loaded
+        self._load_model()
+
+        # Extract the CLIP results
+        top_scene, top_confidence = clip_analysis.get("top_scene", ("unknown", 0))
+        viewpoint = clip_analysis.get("viewpoint", ("standard", 0))[0]
+        lighting = clip_analysis.get("lighting_condition", ("unknown", 0))[0]
+
+        # Format the cultural analysis
+        cultural_str = self._format_cultural_analysis(clip_analysis.get("cultural_analysis", {}))
+
+        # Build the prompt
+        prompt = self.no_detection_template.format(
+            top_scene=top_scene,
+            top_confidence=top_confidence,
+            viewpoint=viewpoint,
+            lighting_condition=lighting,
+            cultural_analysis=cultural_str
+        )
+
+        # Call the LLM to generate the description
+        description = self._generate_llm_response(prompt)
+
+        # Polish the output
+        return self._clean_llm_response(description)
+
+    def _clean_input_text(self, text: str) -> str:
+        """
+        Apply general formatting cleanup to the input text, handling common formatting issues.
+
+        Args:
+            text: Input text
+
+        Returns:
+            Cleaned text
+        """
+        if not text:
+            return ""
+
+        # Formatting cleanup
+        # 1. Collapse consecutive punctuation marks
+        text = re.sub(r'([.,;:!?])\1+', r'\1', text)
+
+        # 2. Fix punctuation on incomplete sentences (e.g. "Something," with nothing following)
+        text = re.sub(r',\s*$', '.', text)
+
+        # 3. Fix cases like "word." followed by the next sentence with no space
+        text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)
+
+        # 4. Remove extra whitespace
+        text = re.sub(r'\s+', ' ', text).strip()
+
+        # 5. Ensure the sentence ends properly (append a final period)
+        if text and not text[-1] in '.!?':
+            text += '.'
+
+        return text
+
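Applied in order, the five rules compose; a compact standalone check of the same regexes, with an invented input string:

import re

text = "A quiet street,,  with three cars.It is evening"
text = re.sub(r'([.,;:!?])\1+', r'\1', text)      # rule 1: ",," -> ","
text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)  # rule 3: ".I" -> ". I"
text = re.sub(r'\s+', ' ', text).strip()          # rule 4: collapse whitespace
if text and text[-1] not in '.!?':
    text += '.'                                   # rule 5: close the sentence
print(text)  # "A quiet street, with three cars. It is evening."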
+    def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
+        """
+        Verify and possibly correct the enhanced description so it stays factually accurate, targeting general facts rather than specific scenes.
+
+        Args:
+            original_desc: Original scene description
+            enhanced_desc: Enhanced description to validate
+            scene_type: Scene type
+            detected_objects: List of detected object names
+
+        Returns:
+            Fact-checked description
+        """
+        # If the enhanced description is empty or too short, return the original
+        if not enhanced_desc or len(enhanced_desc) < 30:
+            return original_desc
+
+        # 1. Check numeric consistency (people count, object counts, etc.)
+        # Extract numbers and their associated nouns from the original description
+        number_patterns = [
+            (r'(\d+)\s+(people|person|pedestrians|individuals)', r'\1', r'\2'),  # people count
+            (r'(\d+)\s+(cars|vehicles|automobiles)', r'\1', r'\2'),  # vehicle count
+            (r'(\d+)\s+(buildings|structures)', r'\1', r'\2')  # building count
+        ]
+
+        # Check each number in the original description
+        for pattern, num_group, word_group in number_patterns:
+            original_matches = re.finditer(pattern, original_desc, re.IGNORECASE)
+            for match in original_matches:
+                number = match.group(1)
+                noun = match.group(2)
+
+                # Check whether this number is preserved in the enhanced description.
+                # Build a more general pattern to check whether the enhanced description
+                # contains this number together with the object class.
+                enhanced_pattern = r'(\d+)\s+(' + re.escape(noun) + r'|' + re.escape(noun.rstrip('s')) + r'|' + re.escape(noun + 's') + r')'
+                enhanced_matches = list(re.finditer(enhanced_pattern, enhanced_desc, re.IGNORECASE))
+
+                if not enhanced_matches:
+                    # The number + noun pair is missing from the enhanced description
+                    plural_form = noun if noun.endswith('s') or number == '1' else noun + 's'
+                    if enhanced_desc.startswith("This") or enhanced_desc.startswith("The"):
+                        enhanced_desc = enhanced_desc.replace("This ", f"This scene with {number} {plural_form} ", 1)
+                        enhanced_desc = enhanced_desc.replace("The ", f"The scene with {number} {plural_form} ", 1)
+                    else:
+                        enhanced_desc = f"The scene includes {number} {plural_form}. " + enhanced_desc
+                elif enhanced_matches and match.group(1) != number:
+                    # The pair exists but the number differs, so correct the number
+                    for ematch in enhanced_matches:
+                        wrong_number = ematch.group(1)
+                        enhanced_desc = enhanced_desc.replace(f"{wrong_number} {ematch.group(2)}", f"{number} {ematch.group(2)}")
+
+        # 2. Check perspective consistency
+        perspective_terms = {
+            "aerial": ["aerial", "bird's-eye", "overhead", "top-down", "above", "looking down"],
+            "ground": ["street-level", "ground level", "eye-level", "standing"],
+            "indoor": ["inside", "interior", "indoor", "within"],
+            "close-up": ["close-up", "detailed view", "close shot"]
+        }
+
+        # Determine the original perspective
+        original_perspective = None
+        for persp, terms in perspective_terms.items():
+            if any(term in original_desc.lower() for term in terms):
+                original_perspective = persp
+                break
+
+        # Check whether the perspective was preserved
+        if original_perspective:
+            enhanced_has_perspective = any(term in enhanced_desc.lower() for term in perspective_terms[original_perspective])
+
+            if not enhanced_has_perspective:
+                # Add the previously missing perspective
+                perspective_prefixes = {
+                    "aerial": "From an aerial perspective, ",
+                    "ground": "From street level, ",
+                    "indoor": "In this indoor setting, ",
+                    "close-up": "In this close-up view, "
+                }
+
+                prefix = perspective_prefixes.get(original_perspective, "")
+                if prefix:
+                    if enhanced_desc[0].isupper():
+                        enhanced_desc = prefix + enhanced_desc[0].lower() + enhanced_desc[1:]
+                    else:
+                        enhanced_desc = prefix + enhanced_desc
+
+        # 3. Check scene type consistency
+        if scene_type and scene_type.lower() != "unknown" and scene_type.lower() not in enhanced_desc.lower():
+            # Add the scene type gracefully
+            if enhanced_desc.startswith("This ") or enhanced_desc.startswith("The "):
+                # Avoid duplications like "This scene" plus "This intersection"
+                if "scene" in enhanced_desc[:15].lower():
+                    fixed_type = scene_type.lower()
+                    enhanced_desc = enhanced_desc.replace("scene", fixed_type, 1)
+                else:
+                    enhanced_desc = enhanced_desc.replace("This ", f"This {scene_type} ", 1)
+                    enhanced_desc = enhanced_desc.replace("The ", f"The {scene_type} ", 1)
+            else:
+                enhanced_desc = f"This {scene_type} " + enhanced_desc
+
+        # 4. Keep the text length reasonable; this limit must match the prompt, otherwise they contradict each other
+        words = enhanced_desc.split()
+        if len(words) > 200:
+            # Find a sentence boundary near the word limit
+            truncated = ' '.join(words[:200])
+            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
+
+            if last_period > 0:
+                enhanced_desc = truncated[:last_period+1]
+            else:
+                enhanced_desc = truncated + '.'
+
+        return enhanced_desc
+
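The enhanced_pattern built in step 1 matches the noun in singular or plural form. A compact check with invented text:

import re

noun = "cars"
enhanced_pattern = (r'(\d+)\s+(' + re.escape(noun) + r'|' +
                    re.escape(noun.rstrip('s')) + r'|' + re.escape(noun + 's') + r')')
print(bool(re.search(enhanced_pattern, "There are 3 cars parked.", re.IGNORECASE)))  # True
print(bool(re.search(enhanced_pattern, "A single car waits.", re.IGNORECASE)))       # False: no digit before the noun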
+    def _extract_perspective_from_description(self, description: str) -> str:
+        """Extract viewpoint/perspective information from the original description."""
+        perspective_terms = {
+            "aerial": ["aerial perspective", "aerial view", "bird's-eye view", "overhead view", "from above"],
+            "ground": ["ground level", "eye level", "street level"],
+            "indoor": ["indoor setting", "inside", "interior"]
+        }
+
+        for persp_type, terms in perspective_terms.items():
+            for term in terms:
+                if term.lower() in description.lower():
+                    return term
+
+        return ""
+
+    def _extract_objects_from_description(self, description: str) -> List[str]:
+        """Extract object mentions from the original description."""
+        # Regex patterns for common objects
+        object_patterns = [
+            r'(\d+)\s+(people|persons|pedestrians|individuals)',
+            r'(\d+)\s+(cars|vehicles|automobiles)',
+            r'(\d+)\s+(buildings|structures)',
+            r'(\d+)\s+(plants|potted plants|flowers)',
+            r'(\d+)\s+(beds|furniture|tables|chairs)'
+        ]
+
+        extracted_objects = []
+
+        for pattern in object_patterns:
+            matches = re.finditer(pattern, description, re.IGNORECASE)
+            for match in matches:
+                number = match.group(1)
+                object_type = match.group(2)
+                extracted_objects.append(f"{number} {object_type}")
+
+        return extracted_objects
+
+    def _ensure_scene_type_consistency(self, description: str, scene_type: str, original_desc: str) -> str:
+        """Ensure the scene type in the description matches the specified scene type."""
+        # Scene words that must not be introduced
+        prohibited_scene_words = ["plaza", "square", "european", "asian", "american"]
+
+        # Check for prohibited scene words
+        for word in prohibited_scene_words:
+            if word in description.lower() and word not in original_desc.lower() and word not in scene_type.lower():
+                # Replace the wrong scene word with the correct scene type
+                pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
+                description = pattern.sub(scene_type, description)
+
+        # Ensure the scene type is mentioned in the description
+        if scene_type.lower() not in description.lower():
+            # Look for a generic scene word to replace
+            for general_term in ["scene", "area", "place", "location"]:
+                if general_term in description.lower():
+                    pattern = re.compile(r'\b' + general_term + r'\b', re.IGNORECASE)
+                    description = pattern.sub(scene_type, description, count=1)
+                    break
+            else:
+                # If no generic word was found, prepend the scene type
+                if description.startswith("The "):
+                    description = description.replace("The ", f"The {scene_type} ", 1)
+                elif description.startswith("This "):
+                    description = description.replace("This ", f"This {scene_type} ", 1)
+                else:
+                    description = f"This {scene_type} " + description
+
+        return description
+
+    def _generate_llm_response(self, prompt: str) -> str:
+        """Generate the LLM response."""
+        self._load_model()
+
+        try:
+            self.call_count += 1
+            self.logger.info(f"LLM call #{self.call_count}")
+
+            # Clear the GPU cache
+            torch.cuda.empty_cache()
+
+            # Set a fixed seed for more consistent output
+            torch.manual_seed(42)
+
+            # Prepare the inputs
+            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=self.max_length).to(self.device)
+
+            # Adjust parameters by model type
+            generation_params = {
+                "max_new_tokens": 120,
+                "pad_token_id": self.tokenizer.eos_token_id,
+                "attention_mask": inputs.attention_mask,
+                "use_cache": True,
+            }
+
+            # Llama-specific parameters
+            if "llama" in self.model_path.lower():
+                generation_params.update({
+                    "temperature": 0.4,  # keep this low, otherwise the model tends to get too opinionated
+                    "max_new_tokens": 600,
+                    "do_sample": True,
+                    "top_p": 0.8,
+                    "repetition_penalty": 1.2,  # repetition penalty, helps avoid repeated words
+                    "num_beams": 4,
+                    "length_penalty": 1.2,
+                })
+            else:
+                # Parameters for other models
+                generation_params.update({
+                    "temperature": 0.6,
+                    "max_new_tokens": 300,
+                    "top_p": 0.9,
+                    "do_sample": True,
+                    "num_beams": 1,
+                    "repetition_penalty": 1.05
+                })
+
+            # Generate the response
+            with torch.no_grad():
+                outputs = self.model.generate(inputs.input_ids, **generation_params)
+
+            # Decode the full output
+            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            # Extract the generated response portion
+            assistant_tag = "<|assistant|>"
+            if assistant_tag in full_response:
+                response = full_response.split(assistant_tag)[-1].strip()
+
+                # Check for an unclosed <|assistant|>
+                user_tag = "<|user|>"
+                if user_tag in response:
+                    response = response.split(user_tag)[0].strip()
+            else:
+                # Remove the input prompt
+                input_text = self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
+                response = full_response
+                if response.startswith(input_text):
+                    response = response[len(input_text):].strip()
+
+            # Make sure we never return an empty response
+            if not response or len(response.strip()) < 10:
+                self.logger.warning("Generated response was empty or too short, returning default response")
+                return "No detailed description could be generated."
+
+            return response
+
+        except Exception as e:
+            self.logger.error(f"Error generating LLM response: {str(e)}")
+            import traceback
+            self.logger.error(traceback.format_exc())
+            return "Unable to generate enhanced description."
+
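Two observations on the decoding step. With do_sample=True and num_beams=4, Hugging Face transformers runs beam-search multinomial sampling, so the fixed torch.manual_seed(42) above only makes retries more repeatable, not identical across processes. And the tag-stripping logic can be sketched standalone (the demo string is invented):

def extract_assistant_reply(full_response: str) -> str:
    # Keep only the text after the last <|assistant|> tag,
    # and cut anything after a stray <|user|> tag.
    reply = full_response.split("<|assistant|>")[-1].strip()
    return reply.split("<|user|>")[0].strip()

demo = "<|user|>Describe the scene<|assistant|>A busy intersection at noon."
print(extract_assistant_reply(demo))  # "A busy intersection at noon."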
+    def _clean_llm_response(self, response: str) -> str:
+        """
+        Clean the LLM response to ensure the output contains only clean descriptive text.
+        The model sometimes emits not only the description but also tags, notes, and similar extras.
+
+        Args:
+            response: Original response from the LLM
+
+        Returns:
+            Cleaned description text
+        """
+        if not response:
+            return ""
+
+        # Save the original response as a backup
+        original_response = response
+
+        # 1. Extract content between markers (if present)
+        output_start = response.find("[OUTPUT_START]")
+        output_end = response.find("[OUTPUT_END]")
+        if output_start != -1 and output_end != -1 and output_end > output_start:
+            response = response[output_start + len("[OUTPUT_START]"):output_end].strip()
+
+        # 2. Remove all remaining section markers and instructions
+        section_markers = [
+            r'\[.*?\]',  # [any text]
+            r'OUTPUT_START\s*:|OUTPUT_END\s*:',  # OUTPUT_START: or OUTPUT_END:
+            r'ENHANCED DESCRIPTION\s*:',  # ENHANCED DESCRIPTION:
+            r'Scene Type\s*:.*?(?=\n|$)',  # Scene Type: text
+            r'Original Description\s*:.*?(?=\n|$)',  # Original Description: text
+            r'GOOD\s*:|BAD\s*:',  # GOOD: or BAD:
+            r'PROBLEM\s*:.*?(?=\n|$)',  # PROBLEM: text
+            r'</?\|(?:assistant|system|user)\|>',  # Dialog markers
+            r'\(Note:.*?\)',  # Notes in parentheses
+            r'\(.*?I\'ve.*?\)',  # Common explanatory content
+            r'\(.*?as per your request.*?\)'  # References to instructions
+        ]
+
+        for marker in section_markers:
+            response = re.sub(marker, '', response, flags=re.IGNORECASE)
+
+        # 3. Remove common prefixes and suffixes
+        prefixes_to_remove = [
+            "Enhanced Description:",
+            "Scene Description:",
+            "Description:",
+            "Here is the enhanced description:",
+            "Here's the enhanced description:"
+        ]
+
+        for prefix in prefixes_to_remove:
+            if response.lower().startswith(prefix.lower()):
+                response = response[len(prefix):].strip()
+
+        # 4. Remove any Context tags or text containing Context
+        response = re.sub(r'<\s*Context:.*?>', '', response)
+        response = re.sub(r'Context:.*?(?=\n|$)', '', response)
+        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)
+
+        # 5. Clean improper scene type references
+        scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
+        match = re.search(scene_type_pattern, response)
+        if match and '_' in match.group(1):
+            fixed_text = f"This scene {match.group(2)}"
+            response = re.sub(scene_type_pattern, fixed_text, response)
+
+        # 6. Reduce dash usage for more natural punctuation
+        response = re.sub(r'—', ', ', response)
+        response = re.sub(r' - ', ', ', response)
+
+        # 7. Remove excess whitespace and line breaks
+        response = response.replace('\r', ' ')
+        response = re.sub(r'\n+', ' ', response)  # replace all newlines with spaces
+        response = re.sub(r'\s{2,}', ' ', response)  # collapse multiple spaces into one
+
+        # 8. Remove Markdown formatting
+        response = re.sub(r'\*\*|\*|__|\|', '', response)  # remove Markdown indicators
+
+        # 9. Detect and remove duplicate sentences
+        sentences = re.split(r'(?<=[.!?])\s+', response)
+        unique_sentences = []
+        seen_content = set()
+
+        for sentence in sentences:
+            # Skip empty sentences
+            if not sentence.strip():
+                continue
+
+            # Create a simplified version for comparison (lowercase, no punctuation)
+            simplified = re.sub(r'[^\w\s]', '', sentence.lower())
+            simplified = ' '.join(simplified.split())  # standardize whitespace
+
+            # Check if we've seen a similar sentence
+            is_duplicate = False
+            for existing in seen_content:
+                if len(simplified) > 10 and (existing in simplified or simplified in existing):
+                    is_duplicate = True
+                    break
+
+            if not is_duplicate and simplified:
+                unique_sentences.append(sentence)
+                seen_content.add(simplified)
+
+        # Recombine the unique sentences
+        response = ' '.join(unique_sentences)
+
+        # 10. Keep the word count within the limit (cap at 200 words to match the prompt)
+        words = response.split()
+        if len(words) > 200:
+            # Find a sentence ending near the word limit
+            truncated = ' '.join(words[:200])
+            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
+
+            if last_period > 0:
+                response = truncated[:last_period+1]
+            else:
+                response = truncated + "."
+
+        # 11. Check sentence completeness
+        if response and not response.strip()[-1] in ['.', '!', '?']:
+            # Find the last preposition or conjunction
+            common_prepositions = ["into", "onto", "about", "above", "across", "after", "along", "around", "at", "before", "behind", "below", "beneath", "beside", "between", "beyond", "by", "down", "during", "except", "for", "from", "in", "inside", "near", "of", "off", "on", "over", "through", "to", "toward", "under", "up", "upon", "with", "within"]
+
+            # Check if the text ends with a preposition or conjunction
+            last_word = response.strip().split()[-1].lower() if response.strip().split() else ""
+            if last_word in common_prepositions or last_word in ["and", "or", "but"]:
+                # Find the last complete sentence
+                last_period = max(response.rfind('.'), response.rfind('!'), response.rfind('?'))
+                if last_period > 0:
+                    response = response[:last_period+1]
+                else:
+                    # If no complete sentence was found, trim the ending
+                    words = response.strip().split()
+                    if words:
+                        # Drop the trailing preposition or conjunction
+                        response = " ".join(words[:-1]) + "."
+
+        # 12. Make sure we haven't over-filtered
+        if not response or len(response) < 40:
+            # Try to get the first meaningful paragraph from the original response
+            paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
+            if paragraphs:
+                # Choose the longest paragraph, as it's most likely the actual description
+                best_para = max(paragraphs, key=len)
+                # Clean it using a subset of the rules above
+                best_para = re.sub(r'\[.*?\]', '', best_para)  # remove [SECTION] markers
+                best_para = re.sub(r'\s{2,}', ' ', best_para).strip()  # clean whitespace
+
+                if len(best_para) >= 40:
+                    return best_para
+
+            # If there is still no usable content, return a simple message
+            return "Unable to generate a valid enhanced description."
+
+        # 13. Final cleaning - catch any missed special cases
+        response = re.sub(r'</?\|.*?\|>', '', response)  # any remaining tags
+        response = re.sub(r'\(.*?\)', '', response)  # any remaining parenthetical content
+        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)  # any remaining notes
+
+        # Ensure proper spacing after punctuation
+        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
+
+        # Ensure the first letter is capitalized
+        if response and response[0].islower():
+            response = response[0].upper() + response[1:]
+
+        # 14. Normalize the format - ensure the output is always a single paragraph
+        response = re.sub(r'\s*\n\s*', ' ', response)  # replace all newlines with spaces
+        response = ' '.join(response.split())
+
+        return response.strip()
+
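The near-duplicate filter in step 9 treats one sentence as a duplicate of another when either simplified form contains the other. A small demonstration with invented sentences:

import re

def simplify(sentence: str) -> str:
    # Same normalization as step 9: lowercase, strip punctuation, collapse whitespace
    s = re.sub(r'[^\w\s]', '', sentence.lower())
    return ' '.join(s.split())

a = simplify("The intersection is crowded with people.")
b = simplify("The intersection is crowded with people, indeed.")
# a is contained in b, so the second sentence would be dropped as a duplicate
print(a in b)  # True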
+    def _format_objects_for_prompt(self, objects: List[Dict]) -> str:
+        """Format the object list for use in prompts."""
+        if not objects:
+            return "No objects detected"
+
+        formatted = []
+        for obj in objects:
+            formatted.append(f"{obj['class_name']} (confidence: {obj['confidence']:.2f})")
+
+        return "\n- " + "\n- ".join(formatted)
+
+    def _format_lighting(self, lighting_info: Dict) -> str:
+        """Format lighting information for use in prompts."""
+        if not lighting_info:
+            return "Unknown lighting conditions"
+
+        time = lighting_info.get("time_of_day", "unknown")
+        conf = lighting_info.get("confidence", 0)
+        is_indoor = lighting_info.get("is_indoor", False)
+
+        base_info = f"{'Indoor' if is_indoor else 'Outdoor'} {time} (confidence: {conf:.2f})"
+
+        # Append more detailed diagnostic information
+        diagnostics = lighting_info.get("diagnostics", {})
+        if diagnostics:
+            diag_str = "\nAdditional lighting diagnostics:"
+            for key, value in diagnostics.items():
+                diag_str += f"\n- {key}: {value}"
+            base_info += diag_str
+
+        return base_info
+
+    def _format_zones(self, zones: Dict) -> str:
+        """Format functional zones for use in prompts."""
+        if not zones:
+            return "No distinct functional zones identified"
+
+        formatted = ["Identified functional zones:"]
+        for zone_name, zone_data in zones.items():
+            desc = zone_data.get("description", "")
+            objects = zone_data.get("objects", [])
+
+            zone_str = f"- {zone_name}: {desc}"
+            if objects:
+                zone_str += f" (Contains: {', '.join(objects)})"
+
+            formatted.append(zone_str)
+
+        return "\n".join(formatted)
+
+    def _format_clip_results(self, clip_analysis: Dict) -> str:
+        """Format CLIP analysis results for use in prompts."""
+        if not clip_analysis or "error" in clip_analysis:
+            return "No CLIP analysis available"
+
+        parts = ["CLIP Analysis Results:"]
+
+        # Top scene
+        top_scene, confidence = clip_analysis.get("top_scene", ("unknown", 0))
+        parts.append(f"- Most likely scene: {top_scene} (confidence: {confidence:.2f})")
+
+        # Viewpoint
+        viewpoint, vp_conf = clip_analysis.get("viewpoint", ("standard", 0))
+        parts.append(f"- Camera viewpoint: {viewpoint} (confidence: {vp_conf:.2f})")
+
+        # Object combinations
+        if "object_combinations" in clip_analysis:
+            combos = []
+            for combo, score in clip_analysis["object_combinations"][:3]:
+                combos.append(f"{combo} ({score:.2f})")
+            parts.append(f"- Object combinations: {', '.join(combos)}")
+
+        # Cultural analysis
+        if "cultural_analysis" in clip_analysis:
+            parts.append("- Cultural analysis:")
+            for culture_type, data in clip_analysis["cultural_analysis"].items():
+                best_desc = data.get("best_description", "")
+                desc_conf = data.get("confidence", 0)
+                parts.append(f"  * {culture_type}: {best_desc} ({desc_conf:.2f})")
+
+        return "\n".join(parts)
+
+    def _format_cultural_analysis(self, cultural_analysis: Dict) -> str:
+        """Format cultural analysis results."""
+        if not cultural_analysis:
+            return "No specific cultural elements detected"
+
+        parts = []
+        for culture_type, data in cultural_analysis.items():
+            best_desc = data.get("best_description", "")
+            desc_conf = data.get("confidence", 0)
+            parts.append(f"{culture_type}: {best_desc} (confidence: {desc_conf:.2f})")
+
+        return "\n".join(parts)

requirements.txt
CHANGED
@@ -9,3 +9,8 @@ gradio>=3.32.0
git+https://github.com/openai/CLIP.git
yt-dlp>=2023.3.4
requests>=2.28.1
+transformers
+accelerate
+bitsandbytes
+sentencepiece
+huggingface_hub>=0.19.0

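The new dependencies back the LLM enhancer: transformers and sentencepiece for the model and tokenizer, accelerate for device placement, bitsandbytes for quantized loading. A hedged sketch of the kind of loading call these packages enable; the model id and options here are illustrative, not necessarily what llm_enhancer.py uses:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-3B-Instruct"  # hypothetical choice
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",   # requires accelerate
    load_in_4bit=True,   # requires bitsandbytes
)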
scene_analyzer.py
CHANGED
@@ -6,6 +6,7 @@ from spatial_analyzer import SpatialAnalyzer
from scene_description import SceneDescriptor
from enhance_scene_describer import EnhancedSceneDescriber
from clip_analyzer import CLIPAnalyzer
+from llm_enhancer import LLMEnhancer
from scene_type import SCENE_TYPES
from object_categories import OBJECT_CATEGORIES

@@ -14,7 +15,7 @@ class SceneAnalyzer:
    Core class for scene analysis and understanding based on object detection results.
    Analyzes detected objects, their relationships, and infers the scene type.
    """
-    def __init__(self, class_names: Dict[int, str] = None):
+    def __init__(self, class_names: Dict[int, str] = None, use_llm: bool = True, llm_model_path: str = None):
        """
        Initialize the scene analyzer with optional class name mappings.
        Args:

@@ -40,6 +41,18 @@ class SceneAnalyzer:
            print("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
            self.use_clip = False

+        # Initialize the LLM model
+        self.use_llm = use_llm
+        if use_llm:
+            try:
+                # from llm_enhancer import LLMEnhancer
+                self.llm_enhancer = LLMEnhancer(model_path=llm_model_path)
+                print(f"LLM enhancer initialized successfully.")
+            except Exception as e:
+                print(f"Warning: Could not initialize LLM enhancer: {e}")
+                print("Scene analysis will proceed without LLM. Make sure required packages are installed.")
+                self.use_llm = False
+
    def generate_scene_description(self,
                                   scene_type,
                                   detected_objects,

@@ -106,8 +119,31 @@ class SceneAnalyzer:
        Returns:
            Dictionary with scene analysis results
        """
-        # If no result or no detections,
+        # If no result or no detections, handle with LLM if possible
        if detection_result is None or len(detection_result.boxes) == 0:
+            if self.use_llm and self.use_clip and detection_result is not None:
+                # Use CLIP and the LLM to analyze the case with no object detections
+                try:
+                    original_image = detection_result.orig_img
+                    clip_analysis = self.clip_analyzer.analyze_image(original_image)
+                    llm_description = self.llm_enhancer.handle_no_detection(clip_analysis)
+
+                    return {
+                        "scene_type": "llm_inferred",
+                        "confidence": clip_analysis.get("top_scene", ("unknown", 0))[1],
+                        "description": "No objects detected by standard detection.",
+                        "enhanced_description": llm_description,
+                        "objects_present": [],
+                        "object_count": 0,
+                        "regions": {},
+                        "possible_activities": [],
+                        "safety_concerns": [],
+                        "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
+                    }
+                except Exception as e:
+                    print(f"Error in LLM no-detection handling: {e}")
+
+            # If LLM/CLIP are unavailable or handling failed, return the original no-detection result
            return {
                "scene_type": "unknown",
                "confidence": 0,

@@ -226,6 +262,53 @@ class SceneAnalyzer:
            functional_zones=functional_zones
        )

+        # Run the LLM enhancement
+        enhanced_description = None
+        llm_verification = None
+
+        if self.use_llm:
+            try:
+                # Prepare the scene data for the LLM
+                scene_data = {
+                    "original_description": scene_description,
+                    "scene_type": best_scene,
+                    "scene_name": self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown"),
+                    "detected_objects": detected_objects,
+                    "confidence": scene_confidence,
+                    "lighting_info": lighting_info,
+                    "functional_zones": functional_zones,
+                    "activities": activities,
+                    "safety_concerns": safety_concerns,
+                    "clip_analysis": clip_analysis
+                }
+
+                # If the CLIP and YOLO results differ significantly, verify with the LLM
+                if self.use_clip and clip_analysis and "top_scene" in clip_analysis:
+                    clip_top_scene = clip_analysis["top_scene"][0]
+                    clip_confidence = clip_analysis["top_scene"][1]
+
+                    # Verify when CLIP and YOLO predict different scenes and both are reasonably confident
+                    if clip_top_scene != best_scene and clip_confidence > 0.4 and scene_confidence > 0.4:
+                        llm_verification = self.llm_enhancer.verify_detection(
+                            detected_objects,
+                            clip_analysis,
+                            best_scene,
+                            self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown"),
+                            scene_confidence
+                        )
+
+                        # Add the verification result to the scene data
+                        scene_data["verification_result"] = llm_verification.get("verification_text", "")
+
+                # Generate the enhanced description with the LLM
+                enhanced_description = self.llm_enhancer.enhance_description(scene_data)
+
+            except Exception as e:
+                print(f"Error in LLM enhancement: {e}")
+                import traceback
+                traceback.print_exc()
+                enhanced_description = None
+
        # Return comprehensive analysis
        result = {
            "scene_type": best_scene if scene_confidence >= scene_confidence_threshold else "unknown",
@@ -233,6 +316,7 @@ class SceneAnalyzer:
                          if scene_confidence >= scene_confidence_threshold else "Unknown Scene",
            "confidence": scene_confidence,
            "description": scene_description,
+            "enhanced_description": enhanced_description,  # the LLM-enhanced description
            "objects_present": [
                {"class_id": obj["class_id"],
                 "class_name": obj["class_name"],
@@ -248,6 +332,12 @@ class SceneAnalyzer:
            "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
        }

+        # If there is an LLM verification result, add it to the output
+        if llm_verification:
+            result["llm_verification"] = llm_verification.get("verification_text")
+            if llm_verification.get("has_errors", False):
+                result["detection_warnings"] = "LLM detected potential issues with object recognition"
+
        # Add the CLIP-specific results
        if clip_analysis and "error" not in clip_analysis:
            result["clip_analysis"] = {

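Taken together, the new wiring degrades gracefully: if the LLM packages are missing, __init__ prints a warning and falls back to use_llm=False, so callers can branch on that flag. A minimal, hypothetical sketch of constructing the analyzer with the new arguments:

from scene_analyzer import SceneAnalyzer

# llm_model_path=None leaves the model choice to LLMEnhancer's default
analyzer = SceneAnalyzer(use_llm=True, llm_model_path=None)
print(analyzer.use_llm)  # False if the LLM enhancer could not be initialized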
scene_type.py
CHANGED
@@ -282,13 +282,6 @@ SCENE_TYPES = {
        "minimum_required": 1,
        "description": "A traditional Asian temple complex with visitors and cultural elements"
    },
-    "european_plaza": {
-        "name": "European Plaza",
-        "required_objects": [0],  # person
-        "optional_objects": [1, 2, 4, 9, 24, 26, 67],  # bicycle, car, airplane, traffic light, backpack, handbag, cell phone
-        "minimum_required": 1,
-        "description": "A European-style city plaza with historic architecture and pedestrian activity"
-    },

    # specific time item
    "nighttime_street": {

style.py
CHANGED
@@ -289,13 +289,13 @@ class Style:
        padding: 15px !important; /* inner padding so the text has room from the border */
        border-radius: 8px !important; /* rounded corners */
        margin: 10px 0 20px 0 !important; /* spacing from other elements, especially above and below */
-        display: block !important;
-        width: 100% !important;
-        box-sizing: border-box !important;
+        display: block !important;
+        width: 100% !important;
+        box-sizing: border-box !important;
    }

    #scene_analysis_description_text p {
-        margin: 0 !important;
+        margin: 0 !important;
        color: #2D3748 !important; /* ensure the text color */
        font-family: Arial, sans-serif !important;
        font-size: 16px !important; /* adjust the text size as needed */

@@ -485,6 +485,37 @@ class Style:
        max-width: 100% !important;
    }

+    /* Styles for the LLM-enhanced description */
+    #llm_enhanced_description_text {
+        padding: 15px !important;
+        background-color: #ffffff !important;
+        border-radius: 8px !important;
+        border: 1px solid #e2e8f0 !important;
+        margin-bottom: 20px !important;
+        box-shadow: 0 1px 3px rgba(0,0,0,0.05) !important;
+        font-family: Arial, sans-serif !important;
+        line-height: 1.7 !important;
+        color: #2D3748 !important;
+        font-size: 16px !important;
+        width: 100% !important;
+        box-sizing: border-box !important;
+        min-height: 200px !important;
+    }
+
+    /* Styles for the collapsible original-description area */
+    #original_scene_analysis_accordion {
+        margin-top: 10px !important;
+        margin-bottom: 20px !important;
+        background-color: #f8f9fa !important;
+        border-radius: 8px !important;
+        border: 1px solid #e2e8f0 !important;
+    }
+
+    /* Keep the accordion content consistent with the page styling */
+    #original_scene_analysis_accordion > div:nth-child(2) {
+        padding: 15px !important;
+    }
+
    /* Animation effects to add a sense of interactivity */
    @keyframes fadeIn {
        from { opacity: 0; }
|