DawnC committed on
Commit e83cc4c · verified · 1 Parent(s): 81a4715

Upload 8 files

Files changed (8)
  1. app.py +181 -31
  2. enhance_scene_describer.py +75 -43
  3. image_processor.py +8 -2
  4. llm_enhancer.py +1218 -0
  5. requirements.txt +5 -0
  6. scene_analyzer.py +92 -2
  7. scene_type.py +0 -7
  8. style.py +35 -4
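
At a glance: this commit threads a single `use_llm` flag from a new Gradio checkbox in `app.py` down through `ImageProcessor` into `SceneAnalyzer`, with the new `llm_enhancer.py` wrapping an 8-bit-quantized Llama 3.2 model. A hypothetical condensation of that wiring (the `SceneAnalyzer` keyword arguments are taken from the `image_processor.py` hunk below; this is a sketch, not the committed code):

```python
# Sketch only: how the use_llm flag flows through the pipeline.
from image_processor import ImageProcessor

processor = ImageProcessor(use_llm=True,
                           llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
# process_image() later creates the analyzer lazily with the same flags:
#   SceneAnalyzer(class_names=..., use_llm=self.use_llm,
#                 llm_model_path=self.llm_model_path)
```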
app.py CHANGED
@@ -1,12 +1,13 @@
 import os
 import numpy as np
 import matplotlib.pyplot as plt
 import gradio as gr
 from typing import Dict, List, Any, Optional, Tuple
- import cv2
- from PIL import Image
- import tempfile
- import uuid
 import spaces

 from detection_model import DetectionModel
@@ -15,10 +16,11 @@ from evaluation_metrics import EvaluationMetrics
 from style import Style
 from image_processor import ImageProcessor
 from video_processor import VideoProcessor

- # Initialize Processors
- image_processor = ImageProcessor()
- video_processor = VideoProcessor(image_processor)

 # Helper Function
 def get_all_classes():
@@ -56,10 +58,15 @@ def get_all_classes():
 return sorted(default_classes.items())

 @spaces.GPU
- def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None):
 """Processes a single uploaded image."""
- print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}")
 try:
 class_ids_to_filter = None
 if filter_classes:
 class_ids_to_filter = []
@@ -118,8 +125,127 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
 scene_desc = scene_analysis.get("description", "Scene analysis requires detected objects.")
 # Ensure scene_desc is a string before adding HTML
 if not isinstance(scene_desc, str):
- scene_desc = str(scene_desc)
- scene_desc_html = f"<div style='padding:10px; font-family:Arial, sans-serif; line-height:1.7;'>{scene_desc}</div>"

 # Prepare activities list
 activities_list = scene_analysis.get("possible_activities", [])
@@ -138,8 +264,15 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
 zones = scene_analysis.get("functional_zones", {})
 lighting = scene_analysis.get("lighting_conditions", {"time_of_day": "unknown", "confidence": 0})

 return (result_image, result_text, formatted_stats, plot_figure,
- scene_desc_html, activities_list_data, safety_data, zones, lighting)

 except Exception as e:
 print(f"Error in handle_image_upload: {e}")
@@ -149,8 +282,8 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
 ax.text(0.5, 0.5, "Processing Error", color="red", ha="center", va="center")
 ax.axis('off')
 # Ensure return structure matches outputs even on error
- return (None, error_msg, {}, fig, f"<div>Error: {str(e)}</div>",
- [["Error"]], [["Error"]], {}, {"time_of_day": "error", "confidence": 0})

 def download_video_from_url(video_url, max_duration_minutes=10):
 """
@@ -273,7 +406,7 @@ def handle_video_upload(video_input, video_url, input_type, model_name, confiden
 return None, error_html, {"error": str(e)}


- # Create Gradio Interface
 def create_interface():
 """Creates the Gradio interface with Tabs."""
 css = Style.get_css()
@@ -283,7 +416,7 @@ def create_interface():

 with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:

- # Header
 with gr.Group(elem_classes="app-header"):
 gr.HTML("""
 <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
@@ -303,10 +436,10 @@ def create_interface():
 </div>
 """)

- # Main Content with Tabs
 with gr.Tabs(elem_classes="tabs"):

- # Tab 1: Image Processing
 with gr.Tab("Image Processing"):
 current_image_model = gr.State("yolov8m.pt") # State for image model selection
 with gr.Row(equal_height=False): # Allow columns to have different heights
@@ -331,6 +464,13 @@ def create_interface():
 label="Confidence Threshold",
 info="Minimum confidence for displaying a detected object"
 )
 with gr.Accordion("Filter Classes", open=False):
 gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
 with gr.Row():
@@ -350,11 +490,12 @@ def create_interface():
 with gr.Group(elem_classes="how-to-use"):
 gr.HTML('<div class="section-heading">How to Use (Image)</div>')
 gr.Markdown("""
- 1. Upload an image or use the camera
- 2. (Optional) Adjust settings like confidence threshold or model size (n, m, x)
- 3. Optionally filter to specific object classes
- 4. Click **Detect Objects** button
- """)
 # Image Examples
 gr.Examples(
 examples=[
@@ -392,8 +533,18 @@ def create_interface():
 </details>
 """)

- # Wrap HTML description for potential styling
- image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")

 with gr.Row():
 with gr.Column(scale=1):
@@ -419,7 +570,7 @@ def create_interface():
 gr.HTML('<div class="section-heading">Detection Statistics</div>')
 image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")

- # Tab 2: Video Processing
 with gr.Tab("Video Processing"):
 with gr.Row(equal_height=False):
 # Left Column: Video Input & Controls
@@ -525,7 +676,7 @@ def create_interface():
 gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
 video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display") # Display statistics

- # Event Listeners
 # Image Model Change Handler
 image_model_dropdown.change(
 fn=lambda model: (model, DetectionModel.get_model_description(model)),
@@ -556,13 +707,12 @@ def create_interface():
 outputs=[video_input, video_url_input]
 )

- # Image Processing Button Click
 image_detect_btn.click(
 fn=handle_image_upload,
- inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter],
 outputs=[
 image_result_image, image_result_text, image_stats_json, image_plot_output,
- image_scene_description_html, image_activities_list, image_safety_list, image_zones_json,
 image_lighting_info
 ]
 )
@@ -584,7 +734,7 @@ def create_interface():
 gr.HTML("""
 <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
 <div style="margin-bottom: 15px;">
- <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP and Ultralytics • Created with Gradio</p>
 </div>
 <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
 <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
 
+ import re
 import os
 import numpy as np
 import matplotlib.pyplot as plt
 import gradio as gr
 from typing import Dict, List, Any, Optional, Tuple
+ import cv2
+ from PIL import Image
+ import tempfile
+ import uuid
 import spaces

 from detection_model import DetectionModel

 from style import Style
 from image_processor import ImageProcessor
 from video_processor import VideoProcessor
+ from llm_enhancer import LLMEnhancer

+ # Initialize Processors with LLM support
+ image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
+ video_processor = VideoProcessor(image_processor)

 # Helper Function
 def get_all_classes():

 return sorted(default_classes.items())

 @spaces.GPU
+ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True):
 """Processes a single uploaded image."""
+ print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}")
 try:
+ image_processor.use_llm = use_llm
+ if hasattr(image_processor, 'scene_analyzer'):
+ image_processor.scene_analyzer.use_llm = use_llm
+ print(f"Updated existing scene_analyzer use_llm setting to: {use_llm}")
+
 class_ids_to_filter = None
 if filter_classes:
 class_ids_to_filter = []
 
 scene_desc = scene_analysis.get("description", "Scene analysis requires detected objects.")
 # Ensure scene_desc is a string before adding HTML
 if not isinstance(scene_desc, str):
+ scene_desc = str(scene_desc)
+
+ def clean_description(desc):
+ if not desc:
+ return ""
+
+ # First, filter out question-and-answer formatting
+ if "Questions:" in desc:
+ desc = desc.split("Questions:")[0].strip()
+ if "Answers:" in desc:
+ desc = desc.split("Answers:")[0].strip()
+
+ # Then filter out code and other non-narrative content line by line
+ lines = desc.split('\n')
+ clean_lines = []
+ skip_block = False
+
+ for line in lines:
+ # Detect question-style lines
+ if re.match(r'^\d+\.\s+(What|How|Why|When|Where|Who|The)', line):
+ continue
+
+ # Check for lines that should be skipped
+ if line.strip().startswith(':param') or line.strip().startswith('"""'):
+ continue
+ if line.strip().startswith("Exercise") or "class SceneDescriptionSystem" in line:
+ skip_block = True
+ continue
+ if ('def generate_scene_description' in line or
+ 'def enhance_scene_descriptions' in line or
+ 'def __init__' in line):
+ skip_block = True
+ continue
+ if line.strip().startswith('#TEST'):
+ skip_block = True
+ continue
+
+ if skip_block and line.strip() == "":
+ skip_block = False
+
+ # If we are not skipping, keep the line
+ if not skip_block:
+ clean_lines.append(line)
+
+ cleaned_text = '\n'.join(clean_lines)
+
+ # If cleaning left nothing, fall back to the first paragraph of the original description
+ if not cleaned_text.strip():
+ paragraphs = [p.strip() for p in desc.split('\n\n') if p.strip()]
+ if paragraphs:
+ return paragraphs[0]
+ return desc
+
+ return cleaned_text
+
+ # Fetch and process the scene description
+ scene_analysis = stats.get("scene_analysis", {})
+ print("Processing scene_analysis:", scene_analysis.keys())
+
+ # Get the original description
+ scene_desc = scene_analysis.get("description", "Scene analysis requires detected objects.")
+ if not isinstance(scene_desc, str):
+ scene_desc = str(scene_desc)
+
+ print(f"Original scene description (first 50 chars): {scene_desc[:50]}...")
+
+ # Make sure a valid description is used
+ clean_scene_desc = clean_description(scene_desc)
+ print(f"Cleaned scene description (first 50 chars): {clean_scene_desc[:50]}...")
+
+ # Show the original content even if cleaning leaves it empty
+ if not clean_scene_desc.strip():
+ clean_scene_desc = scene_desc
+
+ # Build the HTML for the original description
+ scene_desc_html = f"<div>{clean_scene_desc}</div>"
+
+ # Get the LLM-enhanced description; default to an empty string rather than None to avoid a NoneType error
+ enhanced_description = scene_analysis.get("enhanced_description", "")
+ if enhanced_description is None:
+ enhanced_description = ""
+
+ if not enhanced_description or not enhanced_description.strip():
+ print("WARNING: LLM enhanced description is empty!")
+
+ # Prepare the badge and description label
+ llm_badge = ""
+ description_to_show = ""
+
+ if use_llm and enhanced_description:
+ llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background: linear-gradient(90deg, #38b2ac, #4299e1); color:white; font-size:0.7rem; font-weight:bold; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); border: 1px solid rgba(255, 255, 255, 0.2);">LLM Enhanced</span>'
+ description_to_show = enhanced_description
+ # The original description is shown in the "Original Scene Analysis" accordion
+ else:
+ llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background-color:#718096; color:white; font-size:0.7rem; font-weight:bold;">Basic</span>'
+ description_to_show = clean_scene_desc
+ # When the LLM is not used, the accordion shows nothing
+
+ # When the LLM narrative is used, a badge appears next to the heading
+ scene_description_html = f'''
+ <div>
+ <div class="section-heading" style="font-size:1.2rem; margin-top:15px;">Scene Description {llm_badge}
+ <span style="font-size:0.8rem; color:#666; font-weight:normal; display:block; margin-top:2px;">
+ {('(Enhanced by AI language model)' if use_llm and enhanced_description else '(Based on object detection)')}
+ </span>
+ </div>
+ <div style="padding:15px; background-color:#ffffff; border-radius:8px; border:1px solid #e2e8f0; margin-bottom:20px; box-shadow:0 1px 3px rgba(0,0,0,0.05);">
+ {description_to_show}
+ </div>
+ </div>
+ '''
+
+ # The original description is shown in the accordion only when the LLM is used and an enhanced description exists
+ original_desc_visibility = "block" if use_llm and enhanced_description else "none"
+ original_desc_html = f'''
+ <div id="original_scene_analysis_accordion" style="display: {original_desc_visibility};">
+ <div style="padding:15px; background-color:#f0f0f0; border-radius:8px; border:1px solid #e2e8f0;">
+ {clean_scene_desc}
+ </div>
+ </div>
+ '''
 
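Taken out of the handler, the Questions:/Answers: truncation in the new `clean_description` helper is easy to exercise on its own. A trimmed-down, illustrative re-implementation (not the committed function) behaves like this:

```python
import re

def clean_description_demo(desc: str) -> str:
    if not desc:
        return ""
    # Truncate at Q&A markers, as clean_description does
    for marker in ("Questions:", "Answers:"):
        if marker in desc:
            desc = desc.split(marker)[0].strip()
    # Drop numbered question lines such as "1. What is shown?"
    lines = [line for line in desc.split("\n")
             if not re.match(r'^\d+\.\s+(What|How|Why|When|Where|Who|The)', line)]
    return "\n".join(lines)

print(clean_description_demo("A busy street scene.\nQuestions:\n1. What is shown?"))
# -> "A busy street scene."
```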
 # Prepare activities list
 activities_list = scene_analysis.get("possible_activities", [])

 zones = scene_analysis.get("functional_zones", {})
 lighting = scene_analysis.get("lighting_conditions", {"time_of_day": "unknown", "confidence": 0})

+ # Log a warning if a description ended up empty
+ if not clean_scene_desc.strip():
+ print("WARNING: Scene description is empty after cleaning!")
+ if not enhanced_description.strip():
+ print("WARNING: LLM enhanced description is empty!")
+
 return (result_image, result_text, formatted_stats, plot_figure,
+ scene_description_html, original_desc_html,
+ activities_list_data, safety_data, zones, lighting)

 except Exception as e:
 print(f"Error in handle_image_upload: {e}")

 ax.text(0.5, 0.5, "Processing Error", color="red", ha="center", va="center")
 ax.axis('off')
 # Ensure return structure matches outputs even on error
+ return (None, error_msg, {}, fig, f"<div>Error: {str(e)}</div>", "Error",
+ [["Error"]], [["Error"]], {}, {"time_of_day": "error", "confidence": 0})

 def download_video_from_url(video_url, max_duration_minutes=10):
 """
 
 return None, error_html, {"error": str(e)}


+ # Create Gradio Interface
 def create_interface():
 """Creates the Gradio interface with Tabs."""
 css = Style.get_css()

 with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:

+ # Header
 with gr.Group(elem_classes="app-header"):
 gr.HTML("""
 <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">

 </div>
 """)

+ # Main Content with Tabs
 with gr.Tabs(elem_classes="tabs"):

+ # Tab 1: Image Processing
 with gr.Tab("Image Processing"):
 current_image_model = gr.State("yolov8m.pt") # State for image model selection
 with gr.Row(equal_height=False): # Allow columns to have different heights

 label="Confidence Threshold",
 info="Minimum confidence for displaying a detected object"
 )
+
+ use_llm = gr.Checkbox(
+ label="Use LLM for enhanced scene descriptions",
+ value=True,
+ info="Provides more detailed and natural language descriptions (may increase processing time)"
+ )
+
 with gr.Accordion("Filter Classes", open=False):
 gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
 with gr.Row():

 with gr.Group(elem_classes="how-to-use"):
 gr.HTML('<div class="section-heading">How to Use (Image)</div>')
 gr.Markdown("""
+ 1. Upload an image or use the camera
+ 2. (Optional) Adjust settings like confidence threshold or model size (n=fast, m=balanced, x=accurate)
+ 3. In Analysis Settings, you can uncheck "Use LLM for enhanced scene descriptions" if you prefer faster processing
+ 4. Optionally filter to specific object classes
+ 5. Click **Detect Objects** button
+ """)
 # Image Examples
 gr.Examples(
 examples=[

 </details>
 """)

+ gr.HTML('''
+ <div style="margin-top: 5px; padding: 6px 10px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #63b3ed; font-size: 12px; margin-bottom: 10px;">
+ <p style="margin: 0; color: #4a5568;">
+ <b>Note:</b> AI descriptions may vary slightly with each generation, reflecting the creative nature of AI. This is similar to how a person might use different words each time they describe the same image. Processing time may be longer during first use or when analyzing complex scenes, as the LLM enhancement requires additional computational resources.
+ </p>
+ </div>
+ ''')
+ image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
+
+ # The original description is also shown when the LLM-enhanced narrative is used
+ with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
+ image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")

 with gr.Row():
 with gr.Column(scale=1):

 gr.HTML('<div class="section-heading">Detection Statistics</div>')
 image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")

+ # Tab 2: Video Processing
 with gr.Tab("Video Processing"):
 with gr.Row(equal_height=False):
 # Left Column: Video Input & Controls

 gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
 video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display") # Display statistics

+ # Event Listeners
 # Image Model Change Handler
 image_model_dropdown.change(
 fn=lambda model: (model, DetectionModel.get_model_description(model)),

 outputs=[video_input, video_url_input]
 )

 image_detect_btn.click(
 fn=handle_image_upload,
+ inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm],
 outputs=[
 image_result_image, image_result_text, image_stats_json, image_plot_output,
+ image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
 image_lighting_info
 ]
 )

 gr.HTML("""
 <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
 <div style="margin-bottom: 15px;">
+ <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
 </div>
 <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
 <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
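
Reduced to its essentials, the UI contract added above is: the `use_llm` checkbox becomes an extra input, and the handler now returns two HTML blocks (the enhanced description plus the original one for the accordion). A minimal runnable sketch with a stub handler (component names follow the diff; everything else is illustrative):

```python
import gradio as gr

def handle_image_upload_stub(image, use_llm=True):
    # Stub: the real handler also runs detection and scene analysis
    enhanced = "<div>LLM-enhanced description</div>" if use_llm else ""
    original = "<div>Rule-based description</div>"
    return enhanced or original, original

with gr.Blocks() as demo:
    image_input = gr.Image()
    use_llm = gr.Checkbox(label="Use LLM for enhanced scene descriptions", value=True)
    image_scene_description_html = gr.HTML(elem_id="scene_analysis_description_text")
    with gr.Accordion("Original Scene Analysis", open=False):
        image_llm_description = gr.HTML(elem_id="original_scene_description_text")
    detect_btn = gr.Button("Detect Objects")
    detect_btn.click(fn=handle_image_upload_stub,
                     inputs=[image_input, use_llm],
                     outputs=[image_scene_description_html, image_llm_description])
```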
enhance_scene_describer.py CHANGED
@@ -164,8 +164,8 @@ class EnhancedSceneDescriber:
 "elevated_threshold": 0.6, # Objects mostly in middle/bottom
 "elevated_top_threshold": 0.3 # Few objects at top of frame
 }
-
-
 def generate_description(self,
 scene_type: str,
 detected_objects: List[Dict],
@@ -193,7 +193,7 @@ class EnhancedSceneDescriber:
 return self._format_final_description(self._generate_generic_description(detected_objects, lighting_info))

 # Detect viewpoint
- viewpoint = self._detect_viewpoint(detected_objects)

 # Process aerial viewpoint scene types
 if viewpoint == "aerial":
@@ -326,7 +326,7 @@ class EnhancedSceneDescriber:
 r"with \d+ people",
 r"with \d+ person"
 ]
-
 # Check and remove each pattern
 filtered_description = description
 for pattern in small_people_patterns:
@@ -358,40 +358,72 @@ class EnhancedSceneDescriber:
 # Final formatting to ensure correct punctuation and capitalization
 description = self._format_final_description(description)

- return description

 def _smart_append(self, current_text: str, new_fragment: str) -> str:
 """
 Intelligently append a new text fragment to the current text,
 handling punctuation and capitalization correctly.
-
 Args:
 current_text: The existing text to append to
 new_fragment: The new text fragment to append
-
 Returns:
 str: The combined text with proper formatting
 """
 # Handle empty cases
 if not new_fragment:
 return current_text
-
 if not current_text:
 # Ensure first character is uppercase for the first fragment
 return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""
-
 # Clean up existing text
 current_text = current_text.rstrip()
-
 # Check for ending punctuation
 ends_with_sentence = current_text.endswith(('.', '!', '?'))
 ends_with_comma = current_text.endswith(',')
-
 # Specifically handle the "A xxx A yyy" pattern that's causing issues
 if (current_text.startswith("A ") or current_text.startswith("An ")) and \
 (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
 return current_text + ". " + new_fragment
-
 # Decide how to join the texts
 if ends_with_sentence:
 # After a sentence, start with uppercase and add proper spacing
@@ -406,7 +438,7 @@ class EnhancedSceneDescriber:
 # When adding a new sentence about the scene, use a period
 joined_text = current_text + ". " + new_fragment
 else:
- # For other cases, decide based on the content
 if self._is_related_phrases(current_text, new_fragment):
 if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
 joined_text = current_text + ", " + new_fragment
@@ -415,18 +447,18 @@ class EnhancedSceneDescriber:
 else:
 # Use period for unrelated phrases
 joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])
-
 return joined_text

 def _is_related_phrases(self, text1: str, text2: str) -> bool:
 """
 Determine if two phrases are related and should be connected with a comma
 rather than separated with a period.
-
 Args:
 text1: The first text fragment
 text2: The second text fragment to be appended
-
 Returns:
 bool: Whether the phrases appear to be related
 """
@@ -434,61 +466,61 @@ class EnhancedSceneDescriber:
 if (text1.startswith("A ") or text1.startswith("An ")) and \
 (text2.startswith("A ") or text2.startswith("An ")):
 return False # These are separate descriptions, not related phrases
-
 # Check if the second phrase starts with a connecting word
- connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
 "this", "these", "that", "those", "and", "or", "but"]
-
 first_word = text2.split()[0].lower() if text2 else ""
 if first_word in connecting_words:
 return True
-
 # Check if the first phrase ends with something that suggests continuity
- ending_patterns = ["such as", "including", "like", "especially", "particularly",
 "for example", "for instance", "namely", "specifically"]
-
 for pattern in ending_patterns:
 if text1.lower().endswith(pattern):
 return True
-
 # Check if both phrases are about the scene
 if "scene" in text1.lower() and "scene" in text2.lower():
 return False # Separate statements about the scene should be separate sentences
-
 return False

 def _format_final_description(self, text: str) -> str:
 """
 Format the final description text to ensure correct punctuation,
 capitalization, and spacing.
-
 Args:
 text: The text to format
-
 Returns:
 str: The properly formatted text
 """
 import re
-
 if not text:
 return ""
-
 # 1. Special-case consecutive fragments that start with "A" (a common problem)
 text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text, flags=re.IGNORECASE)
 text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
-
 # 2. Ensure the first letter is capitalized
 text = text[0].upper() + text[1:] if text else ""
-
 # 3. Fix spacing between words
 text = re.sub(r'\s{2,}', ' ', text) # collapse multiple spaces into one
 text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text) # add a space between a lowercase and an uppercase letter
-
 # 4. Fix run-together words
 text = re.sub(r'([a-zA-Z])and', r'\1 and', text) # add a space before "and"
 text = re.sub(r'([a-zA-Z])with', r'\1 with', text) # add a space before "with"
 text = re.sub(r'plants(and|with|or)', r'plants \1', text) # fix run-ons like "plantsand"
-
 # 5. Fix capitalization after punctuation
 text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text) # capitalize after a period

@@ -498,46 +530,46 @@ class EnhancedSceneDescriber:
 # Exceptions: keep proper nouns, personal pronouns, etc. capitalized
 if word in ["I", "I'm", "I've", "I'd", "I'll"]:
 return match.group(0) # keep as-is
-
 # Keep months, weekdays, place names and other proper nouns capitalized
- proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
 "August", "September", "October", "November", "December",
 "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
 if word in proper_nouns:
 return match.group(0) # keep as-is
-
 # Otherwise, lowercase the first letter
 return match.group(1) + word[0].lower() + word[1:]
-
 # Match a comma followed by whitespace and a capitalized word
 text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)
-
-
 common_phrases = [
 (r'Social or seating area', r'social or seating area'),
 (r'Sleeping area', r'sleeping area'),
 (r'Dining area', r'dining area'),
 (r'Living space', r'living space')
 ]
-
 for phrase, replacement in common_phrases:
 # Only lowercase the term mid-sentence; keep sentence-initial capitalization
 text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
 # Also lowercase the term after a comma
 text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
-
 # 7. Ensure proper spacing around punctuation
 text = re.sub(r'\s+([.,;:!?])', r'\1', text) # no space before punctuation
 text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text) # a space after punctuation
-
 # 8. Fix repeated punctuation
 text = re.sub(r'\.{2,}', '.', text) # collapse multiple periods
 text = re.sub(r',{2,}', ',', text) # collapse multiple commas
-
 # 9. Ensure the text ends with punctuation
 if text and not text[-1] in '.!?':
 text += '.'
-
 return text

 def _is_intersection(self, detected_objects: List[Dict]) -> bool:
 
 "elevated_threshold": 0.6, # Objects mostly in middle/bottom
 "elevated_top_threshold": 0.3 # Few objects at top of frame
 }
+
+
 def generate_description(self,
 scene_type: str,
 detected_objects: List[Dict],

 return self._format_final_description(self._generate_generic_description(detected_objects, lighting_info))

 # Detect viewpoint
+ viewpoint = self._detect_viewpoint(detected_objects)

 # Process aerial viewpoint scene types
 if viewpoint == "aerial":

 r"with \d+ people",
 r"with \d+ person"
 ]
+
 # Check and remove each pattern
 filtered_description = description
 for pattern in small_people_patterns:

 # Final formatting to ensure correct punctuation and capitalization
 description = self._format_final_description(description)

+ description_lines = description.split('\n')
+ clean_description = []
+ skip_block = False # define this flag before the loop
+
+ for line in description_lines:
+ # Check whether this line should be skipped
+ if line.strip().startswith(':param') or line.strip().startswith('"""'):
+ continue
+ if line.strip().startswith("Exercise") or "class SceneDescriptionSystem" in line:
+ skip_block = True
+ continue
+ if ('def generate_scene_description' in line or
+ 'def enhance_scene_descriptions' in line or
+ 'def __init__' in line):
+ skip_block = True
+ continue
+ if line.strip().startswith('#TEST'):
+ skip_block = True
+ continue
+
+ # A blank line ends skip mode
+ if skip_block and line.strip() == "":
+ skip_block = False
+
+ # If not skipping, add this line to the result
+ if not skip_block:
+ clean_description.append(line)
+
+ # If the filtered description is empty, return the original description
+ if not clean_description:
+ return description
+ else:
+ return '\n'.join(clean_description)

 def _smart_append(self, current_text: str, new_fragment: str) -> str:
 """
 Intelligently append a new text fragment to the current text,
 handling punctuation and capitalization correctly.
+
 Args:
 current_text: The existing text to append to
 new_fragment: The new text fragment to append
+
 Returns:
 str: The combined text with proper formatting
 """
 # Handle empty cases
 if not new_fragment:
 return current_text
+
 if not current_text:
 # Ensure first character is uppercase for the first fragment
 return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""
+
 # Clean up existing text
 current_text = current_text.rstrip()
+
 # Check for ending punctuation
 ends_with_sentence = current_text.endswith(('.', '!', '?'))
 ends_with_comma = current_text.endswith(',')
+
 # Specifically handle the "A xxx A yyy" pattern that's causing issues
 if (current_text.startswith("A ") or current_text.startswith("An ")) and \
 (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
 return current_text + ". " + new_fragment
+
 # Decide how to join the texts
 if ends_with_sentence:
 # After a sentence, start with uppercase and add proper spacing

 # When adding a new sentence about the scene, use a period
 joined_text = current_text + ". " + new_fragment
 else:
+ # For other cases, decide based on the content
 if self._is_related_phrases(current_text, new_fragment):
 if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
 joined_text = current_text + ", " + new_fragment

 else:
 # Use period for unrelated phrases
 joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])
+
 return joined_text

 def _is_related_phrases(self, text1: str, text2: str) -> bool:
 """
 Determine if two phrases are related and should be connected with a comma
 rather than separated with a period.
+
 Args:
 text1: The first text fragment
 text2: The second text fragment to be appended
+
 Returns:
 bool: Whether the phrases appear to be related
 """

 if (text1.startswith("A ") or text1.startswith("An ")) and \
 (text2.startswith("A ") or text2.startswith("An ")):
 return False # These are separate descriptions, not related phrases
+
 # Check if the second phrase starts with a connecting word
+ connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
 "this", "these", "that", "those", "and", "or", "but"]
+
 first_word = text2.split()[0].lower() if text2 else ""
 if first_word in connecting_words:
 return True
+
 # Check if the first phrase ends with something that suggests continuity
+ ending_patterns = ["such as", "including", "like", "especially", "particularly",
 "for example", "for instance", "namely", "specifically"]
+
 for pattern in ending_patterns:
 if text1.lower().endswith(pattern):
 return True
+
 # Check if both phrases are about the scene
 if "scene" in text1.lower() and "scene" in text2.lower():
 return False # Separate statements about the scene should be separate sentences
+
 return False

 def _format_final_description(self, text: str) -> str:
 """
 Format the final description text to ensure correct punctuation,
 capitalization, and spacing.
+
 Args:
 text: The text to format
+
 Returns:
 str: The properly formatted text
 """
 import re
+
 if not text:
 return ""
+
 # 1. Special-case consecutive fragments that start with "A" (a common problem)
 text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text, flags=re.IGNORECASE)
 text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
+
 # 2. Ensure the first letter is capitalized
 text = text[0].upper() + text[1:] if text else ""
+
 # 3. Fix spacing between words
 text = re.sub(r'\s{2,}', ' ', text) # collapse multiple spaces into one
 text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text) # add a space between a lowercase and an uppercase letter
+
 # 4. Fix run-together words
 text = re.sub(r'([a-zA-Z])and', r'\1 and', text) # add a space before "and"
 text = re.sub(r'([a-zA-Z])with', r'\1 with', text) # add a space before "with"
 text = re.sub(r'plants(and|with|or)', r'plants \1', text) # fix run-ons like "plantsand"
+
 # 5. Fix capitalization after punctuation
 text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text) # capitalize after a period

 # Exceptions: keep proper nouns, personal pronouns, etc. capitalized
 if word in ["I", "I'm", "I've", "I'd", "I'll"]:
 return match.group(0) # keep as-is
+
 # Keep months, weekdays, place names and other proper nouns capitalized
+ proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
 "August", "September", "October", "November", "December",
 "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
 if word in proper_nouns:
 return match.group(0) # keep as-is
+
 # Otherwise, lowercase the first letter
 return match.group(1) + word[0].lower() + word[1:]
+
 # Match a comma followed by whitespace and a capitalized word
 text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)
+
+
 common_phrases = [
 (r'Social or seating area', r'social or seating area'),
 (r'Sleeping area', r'sleeping area'),
 (r'Dining area', r'dining area'),
 (r'Living space', r'living space')
 ]
+
 for phrase, replacement in common_phrases:
 # Only lowercase the term mid-sentence; keep sentence-initial capitalization
 text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
 # Also lowercase the term after a comma
 text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
+
 # 7. Ensure proper spacing around punctuation
 text = re.sub(r'\s+([.,;:!?])', r'\1', text) # no space before punctuation
 text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text) # a space after punctuation
+
 # 8. Fix repeated punctuation
 text = re.sub(r'\.{2,}', '.', text) # collapse multiple periods
 text = re.sub(r',{2,}', ',', text) # collapse multiple commas
+
 # 9. Ensure the text ends with punctuation
 if text and not text[-1] in '.!?':
 text += '.'
+
 return text

 def _is_intersection(self, detected_objects: List[Dict]) -> bool:
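
The joining rule that `_smart_append` implements is easiest to see in isolation. A simplified, illustrative free-function version (it deliberately omits the `_is_related_phrases` branch):

```python
def smart_append_demo(current: str, fragment: str) -> str:
    if not fragment:
        return current
    if not current:
        return fragment[0].upper() + fragment[1:]
    current = current.rstrip()
    # Two fragments that both start with "A "/"An " become separate sentences
    if current.startswith(("A ", "An ")) and fragment.startswith(("A ", "An ")):
        return current + ". " + fragment
    if current.endswith(('.', '!', '?')):
        return current + " " + fragment[0].upper() + fragment[1:]
    return current + ", " + fragment

print(smart_append_demo("A modern kitchen", "An island counter sits in the middle"))
# -> "A modern kitchen. An island counter sits in the middle"
```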
image_processor.py CHANGED
@@ -20,11 +20,13 @@ class ImageProcessor:
 Separates processing logic from UI components
 """

- def __init__(self):
 """Initialize the image processor with required components"""
 self.color_mapper = ColorMapper()
 self.model_instances = {}
 self.lighting_analyzer = LightingAnalyzer()

 def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
 """
@@ -65,7 +67,11 @@ class ImageProcessor:
 try:
 # Initialize scene analyzer if not already done
 if not hasattr(self, 'scene_analyzer'):
- self.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)

 # Make sure the class names are updated correctly
 if self.scene_analyzer.class_names is None:

 Separates processing logic from UI components
 """

+ def __init__(self, use_llm=True, llm_model_path=None):
 """Initialize the image processor with required components"""
 self.color_mapper = ColorMapper()
 self.model_instances = {}
 self.lighting_analyzer = LightingAnalyzer()
+ self.use_llm = use_llm
+ self.llm_model_path = llm_model_path

 def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
 """

 try:
 # Initialize scene analyzer if not already done
 if not hasattr(self, 'scene_analyzer'):
+ self.scene_analyzer = SceneAnalyzer(
+ class_names=detection_result.names,
+ use_llm=self.use_llm,
+ llm_model_path=self.llm_model_path
+ )

 # Make sure the class names are updated correctly
 if self.scene_analyzer.class_names is None:
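
Usage sketch for the new constructor options, mirroring how `app.py` and `handle_image_upload` drive them (only the keyword arguments visible in this diff are assumed; the model path is the one hard-coded in `app.py`):

```python
from image_processor import ImageProcessor

processor = ImageProcessor(use_llm=True,
                           llm_model_path="meta-llama/Llama-3.2-3B-Instruct")

# Per-request toggle, as handle_image_upload does: flip the flag on the
# processor and on an already-created scene_analyzer, if one exists.
processor.use_llm = False
if hasattr(processor, 'scene_analyzer'):
    processor.scene_analyzer.use_llm = False
```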
llm_enhancer.py ADDED
@@ -0,0 +1,1218 @@
+ import re
+ import os
+ import torch
+ from typing import Dict, List, Tuple, Any, Optional
+ import logging
+
+ class LLMEnhancer:
+ """
+ Uses an LLM (large language model) to enhance scene understanding and descriptions.
+ Llama or other LLMs can be integrated later to generate and enrich scene descriptions.
+ """
+
+ def __init__(self,
+ model_path: Optional[str] = None,
+ tokenizer_path: Optional[str] = None,
+ device: Optional[str] = None,
+ max_length: int = 2048,
+ temperature: float = 0.3,
+ top_p: float = 0.85):
+ """
+ Initialize the LLM enhancer
+
+ Args:
+ model_path: Path or Hugging Face model ID of the LLM; defaults to Llama 3.2
+ tokenizer_path: Path of the tokenizer, usually the same as model_path
+ device: Device to run on ('cpu' or 'cuda')
+ max_length: Maximum length of the generated text
+ temperature: Sampling temperature (higher is more creative, lower more conservative)
+ top_p: Nucleus-sampling probability threshold for generation
+ """
+ self.logger = logging.getLogger("LLMEnhancer")
+ self.logger.setLevel(logging.INFO)
+ handler = logging.StreamHandler()
+ handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+ self.logger.addHandler(handler)
+
+ # Default model path: Llama 3.2
+ self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
+ self.tokenizer_path = tokenizer_path or self.model_path
+
+ # Determine the device to run on
+ self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
+ self.logger.info(f"Using device: {self.device}")
+
+ # Generation parameters
+ self.max_length = max_length
+ self.temperature = temperature
+ self.top_p = top_p
+
+ self.model = None
+ self.tokenizer = None
+
+ # Counter that tracks how many times the model is called
+ self.call_count = 0
+
+ self._initialize_prompts()
+
+ # Load the model only when needed
+ self._model_loaded = False
+
+ try:
+ self.hf_token = os.environ.get("HF_TOKEN")
+ if self.hf_token:
+ self.logger.info("Logging in to Hugging Face with token")
+ from huggingface_hub import login
+ login(token=self.hf_token)
+ else:
+ self.logger.warning("HF_TOKEN not found in environment variables. Access to gated models may be limited.")
+ except Exception as e:
+ self.logger.error(f"Error during Hugging Face login: {e}")
+
+ def _load_model(self):
+ """Lazily load the model on first use, with 8-bit quantization to save memory"""
+ if self._model_loaded:
+ return
+
+ try:
+ self.logger.info(f"Loading LLM model from {self.model_path} with 8-bit quantization")
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ torch.cuda.empty_cache()
+
+ # Print available GPU memory
+ if torch.cuda.is_available():
+ free_in_GB = torch.cuda.get_device_properties(0).total_memory / 1024**3
+ print(f"Total GPU memory: {free_in_GB:.2f} GB")
+
+ # Set up the 8-bit quantization config
+ quantization_config = BitsAndBytesConfig(
+ load_in_8bit=True,
+ llm_int8_enable_fp32_cpu_offload=True
+ )
+
+ # Load the tokenizer
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ self.tokenizer_path,
+ padding_side="left",
+ use_fast=False,
+ token=self.hf_token
+ )
+
+ # Set the special tokens
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+
+ # Load the 8-bit quantized model
+ self.model = AutoModelForCausalLM.from_pretrained(
+ self.model_path,
+ quantization_config=quantization_config,
+ device_map="auto",
+ low_cpu_mem_usage=True,
+ token=self.hf_token
+ )
+
+ self.logger.info("Model loaded successfully with 8-bit quantization")
+ self._model_loaded = True
+
+ except Exception as e:
+ self.logger.error(f"Error loading LLM model: {e}")
+ import traceback
+ traceback.print_exc()
+ raise
+
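The same lazy 8-bit loading pattern, as a standalone sketch (assumes `transformers` with `bitsandbytes` installed and an `HF_TOKEN` with access to the gated model; not the committed class):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "meta-llama/Llama-3.2-3B-Instruct"
quant = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left", use_fast=False)
tokenizer.pad_token = tokenizer.eos_token  # Llama defines no pad token by default
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quant, device_map="auto", low_cpu_mem_usage=True
)
```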
+ def _initialize_prompts(self):
+ """Initialize the prompt templates used by the enhancer"""
+ # The main enhancement prompt for the model
+ self.enhance_description_template = """
+ <|system|>
+ You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
+
+ Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
+
+ </|system|>
+
+ <|user|>
+ Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
+
+ ORIGINAL:
+ {original_description}
+
+ CRITICAL RULES:
+ 1. NEVER assume room type, object function, or scene purpose unless directly stated.
+ 2. NEVER invent object types. You are limited to: {object_list}
+ 3. NEVER speculate on object quantity. If the description says "10 people", DO NOT say "dozens" or "many". Maintain the original quantity unless specified.
+ 4. Use terms like "in the scene", "visible in the background", or "positioned in the lower left" instead of assuming direction or layout logic.
+ 5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
+ 6. Write 2–4 complete, well-structured sentences with punctuation.
+ 7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
+ 8. NEVER include explanations, reasoning, or tags. ONLY provide the enhanced description.
+ 9. Do not repeat any sentence structure or phrase more than once.
+ </|user|>
+
+ <|assistant|>
+ """
+
+
+ # Detection-verification prompt
+ self.verify_detection_template = """
+ Task: You are an advanced vision system that verifies computer vision detections for accuracy.
+
+ Analyze the following detection results and identify any potential errors or inconsistencies:
+
+ SCENE TYPE: {scene_type}
+ SCENE NAME: {scene_name}
+ CONFIDENCE: {confidence:.2f}
+
+ DETECTED OBJECTS: {detected_objects}
+
+ CLIP ANALYSIS RESULTS:
+ {clip_analysis}
+
+ Possible Errors to Check:
+ 1. Objects misidentified (e.g., architectural elements labeled as vehicles)
+ 2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
+ 3. Objects that seem out of place for this type of scene
+ 4. Inconsistencies between different detection systems
+
+ If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
+
+ Verification Results:
+ """
+
+ # Prompt used when object detection finds nothing
+ self.no_detection_template = """
+ Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
+
+ Based on advanced image embeddings (CLIP analysis), we have the following information:
+
+ MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
+ VIEWPOINT: {viewpoint}
+ LIGHTING: {lighting_condition}
+
+ CULTURAL ANALYSIS: {cultural_analysis}
+
+ Create a detailed description of what might be in this scene, considering:
+ 1. The most likely type of location or setting
+ 2. Possible architectural or natural elements present
+ 3. The lighting and atmosphere
+ 4. Potential cultural or regional characteristics
+
+ Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
+
+ Scene Description:
+ """
+
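One detail worth noting about these templates: `enhance_description` later calls `.format()` with more keyword arguments than the template declares. That is safe in Python, since `str.format` simply ignores unused keyword arguments. An illustrative check (the template string here is a stand-in, not the one above):

```python
template = "ORIGINAL:\n{original_description}\n\nAllowed objects: {object_list}"
prompt = template.format(
    original_description="This intersection shows 3 people and 2 cars.",
    object_list="3 person, 2 car",
    scene_type="Busy Intersection",  # ignored: no matching placeholder
)
print(prompt)
```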
+ def _clean_llama_response(self, response: str) -> str:
+ """Handle output-format quirks specific to Llama models"""
+ # Apply the generic cleanup first
+ response = self._clean_model_response(response)
+
+ # Remove prefix phrases Llama commonly emits
+ prefixes_to_remove = [
+ "Here's the enhanced description:",
+ "Enhanced description:",
+ "Here is the enhanced scene description:",
+ "I've enhanced the description while preserving all factual details:"
+ ]
+
+ for prefix in prefixes_to_remove:
+ if response.lower().startswith(prefix.lower()):
+ response = response[len(prefix):].strip()
+
+ # Remove possible trailing explanations
+ suffixes_to_remove = [
+ "I've maintained all the key factual elements",
+ "I've preserved all the factual details",
+ "All factual elements have been maintained"
+ ]
+
+ for suffix in suffixes_to_remove:
+ if response.lower().endswith(suffix.lower()):
+ response = response[:response.rfind(suffix)].strip()
+
+ return response
+
+ def _detect_scene_type(self, detected_objects: List[Dict]) -> str:
+ """
+ Detect scene type based on object distribution and patterns
+ """
+ # Default scene type
+ scene_type = "intersection"
+
+ # Count objects by class
+ object_counts = {}
+ for obj in detected_objects:
+ class_name = obj.get("class_name", "")
+ if class_name not in object_counts:
+ object_counts[class_name] = 0
+ object_counts[class_name] += 1
+
+ # Count people
+ people_count = object_counts.get("person", 0)
+
+ # Count vehicles
+ car_count = object_counts.get("car", 0)
+ bus_count = object_counts.get("bus", 0)
+ truck_count = object_counts.get("truck", 0)
+ total_vehicles = car_count + bus_count + truck_count
+
+ # Simple scene type detection logic
+ if people_count > 8 and total_vehicles < 2:
+ scene_type = "pedestrian_crossing"
+ elif people_count > 5 and total_vehicles > 2:
+ scene_type = "busy_intersection"
+ elif people_count < 3 and total_vehicles > 3:
+ scene_type = "traffic_junction"
+
+ return scene_type
+
+ def _clean_scene_type(self, scene_type: str) -> str:
+ """Clean up the scene type so it reads naturally in a prompt"""
+ if not scene_type:
+ return "scene"
+
+ # Replace underscores with spaces and capitalize each word
+ if '_' in scene_type:
+ return ' '.join(word.capitalize() for word in scene_type.split('_'))
+
+ return scene_type
+
+ def _clean_model_response(self, response: str) -> str:
+ """Clean the model response by removing common markers and prefixes"""
+ # Remove any leftover system-style markers
+ response = re.sub(r'<\|.*?\|>', '', response)
+
+ # Remove prefixes such as "This european_plaza" and similar
+ response = re.sub(r'^This [a-z_]+\s+', '', response)
+
+ # Ensure the response starts with an uppercase letter
+ if response and not response[0].isupper():
+ response = response[0].upper() + response[1:]
+
+ return response.strip()
+
+ def _validate_scene_facts(self, enhanced_desc: str, original_desc: str, people_count: int) -> str:
+ """Validate key facts in enhanced description"""
+ # Check if people count is preserved
+ if people_count > 0:
+ people_pattern = re.compile(r'(\d+)\s+(?:people|persons|pedestrians|individuals)', re.IGNORECASE)
+ people_match = people_pattern.search(enhanced_desc)
+
+ if not people_match or int(people_match.group(1)) != people_count:
+ # Replace incorrect count or add if missing
+ if people_match:
+ enhanced_desc = people_pattern.sub(f"{people_count} people", enhanced_desc)
+ else:
+ enhanced_desc = f"The scene shows {people_count} people. " + enhanced_desc
+
+ # Ensure aerial perspective is mentioned
+ if "aerial" in original_desc.lower() and "aerial" not in enhanced_desc.lower():
+ enhanced_desc = "From an aerial perspective, " + enhanced_desc[0].lower() + enhanced_desc[1:]
+
+ return enhanced_desc
+
+ def reset_context(self):
+ """Reset the model context before processing a new image"""
+ if self._model_loaded:
+ # Clear the GPU cache
+ torch.cuda.empty_cache()
+ self.logger.info("Model context reset")
+ else:
+ self.logger.info("Model not loaded, no context to reset")
+
+ def _remove_introduction_sentences(self, response: str) -> str:
+ """Remove introductory sentences the model may prepend to the generated text"""
+ # Recognize common introductory patterns
+ intro_patterns = [
+ r'^Here is the (?:rewritten|enhanced) .*?description:',
+ r'^The (?:rewritten|enhanced) description:',
+ r'^Here\'s the (?:rewritten|enhanced) description of .*?:'
+ ]
+
+ for pattern in intro_patterns:
+ if re.match(pattern, response, re.IGNORECASE):
+ # Take the content after the colon
+ parts = re.split(r':', response, 1)
+ if len(parts) > 1:
+ return parts[1].strip()
+
+ return response
+
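Before the entry point below, a quick illustration of the normalization helper above: `_clean_scene_type` turns the underscore labels produced by `_detect_scene_type` into prompt-friendly text (values illustrative):

```python
# "busy_intersection" -> "Busy Intersection", per _clean_scene_type's rule
label = "busy_intersection"
pretty = ' '.join(word.capitalize() for word in label.split('_'))
assert pretty == "Busy Intersection"
```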
341
+ def enhance_description(self, scene_data: Dict[str, Any]) -> str:
342
+ """改進的場景描述增強器,處理各種場景類型並保留視角與光照資訊,並作為總窗口可運用於其他class"""
343
+ try:
344
+ # 重置上下文
345
+ self.reset_context()
346
+
347
+ # 確保模型已加載
348
+ if not self._model_loaded:
349
+ self._load_model()
350
+
351
+ # extract original description
352
+ original_desc = scene_data.get("original_description", "")
353
+ if not original_desc:
354
+ return "No original description provided."
355
+
356
+ # 獲取scene type 並標準化
357
+ scene_type = scene_data.get("scene_type", "unknown scene")
358
+ scene_type = self._clean_scene_type(scene_type)
359
+
360
+ # 提取檢測到的物件並過濾低置信度物件
361
+ detected_objects = scene_data.get("detected_objects", [])
362
+ filtered_objects = []
363
+
364
+ # 高置信度閾值,嚴格過濾物件
365
+ high_confidence_threshold = 0.65
366
+
367
+ for obj in detected_objects:
368
+ confidence = obj.get("confidence", 0)
369
+ class_name = obj.get("class_name", "")
370
+
371
+ # 為特殊類別設置更高閾值
372
+ special_classes = ["airplane", "helicopter", "boat"]
373
+ if class_name in special_classes:
374
+ if confidence < 0.75: # 為這些類別設置更高閾值
375
+ continue
376
+
377
+ # 僅保留高置信度物件
378
+ if confidence >= high_confidence_threshold:
379
+ filtered_objects.append(obj)
380
+
381
+ # 計算物件列表和數量 - 僅使用過濾後的高置信度物件
382
+ object_counts = {}
383
+ for obj in filtered_objects:
384
+ class_name = obj.get("class_name", "")
385
+ if class_name not in object_counts:
386
+ object_counts[class_name] = 0
387
+ object_counts[class_name] += 1
388
+
389
+ # 將高置信度物件格式化為清單
390
+ high_confidence_objects = ", ".join([f"{count} {obj}" for obj, count in object_counts.items()])
391
+
392
+ # 如果沒有高置信度物件,回退到使用原始描述中的關鍵詞
393
+ if not high_confidence_objects:
394
+ # 從原始描述中提取物件提及
395
+ object_keywords = self._extract_objects_from_description(original_desc)
396
+ high_confidence_objects = ", ".join(object_keywords) if object_keywords else "objects visible in the scene"
397
+
398
+ # 保留原始描述中的關鍵視角信息
399
+ perspective = self._extract_perspective_from_description(original_desc)
400
+
401
+ # 提取光照資訊
402
+ lighting_description = "unknown lighting"
403
+ if "lighting_info" in scene_data:
404
+ lighting_info = scene_data.get("lighting_info", {})
405
+ time_of_day = lighting_info.get("time_of_day", "unknown")
406
+ is_indoor = lighting_info.get("is_indoor", False)
407
+ lighting_description = f"{'indoor' if is_indoor else 'outdoor'} {time_of_day} lighting"
408
+
409
+ # 構建提示詞,整合所有關鍵資訊
410
+ prompt = self.enhance_description_template.format(
411
+ scene_type=scene_type,
412
+ object_list=high_confidence_objects,
413
+ original_description=original_desc,
414
+ perspective=perspective,
415
+ lighting_description=lighting_description
416
+ )
417
+
418
+ # 生成增強描述
419
+ self.logger.info("Generating LLM response...")
420
+ response = self._generate_llm_response(prompt)
421
+
422
+ # 檢查回應完整性的更嚴格標準
423
+ is_incomplete = (
424
+ len(response) < 100 or # 太短
425
+ (len(response) < 200 and "." not in response[-30:]) or # 結尾沒有適當標點
426
+ any(response.endswith(phrase) for phrase in ["in the", "with the", "and the"]) # 以不完整短語結尾
427
+ )
428
+
429
+ max_retries = 3
430
+ attempts = 0
431
+ while attempts < max_retries and is_incomplete:
432
+ self.logger.warning(f"Generated incomplete response, retrying... Attempt {attempts+1}/{max_retries}")
433
+ # 重新生成
434
+ response = self._generate_llm_response(prompt)
435
+ attempts += 1
436
+
437
+ # 重新檢查完整性
438
+ is_incomplete = (len(response) < 100 or
439
+ (len(response) < 200 and "." not in response[-30:]) or
440
+ any(response.endswith(phrase) for phrase in ["in the", "with the", "and the"]))
441
+
442
+ # 確保響應不為空
443
+ if not response or len(response.strip()) < 10:
444
+ self.logger.warning("Generated response was empty or too short, returning original description")
445
+ return original_desc
446
+
447
+ # 清理響應 - 使用與模型相符的清理方法
448
+ if "llama" in self.model_path.lower():
449
+ result = self._clean_llama_response(response)
450
+ else:
451
+ result = self._clean_model_response(response)
452
+
453
+ # 移除介紹性句子
454
+ result = self._remove_introduction_sentences(result)
455
+
456
+ # 移除解釋性注釋
457
+ result = self._remove_explanatory_notes(result)
458
+
459
+ # 進行事實準確性檢查
460
+ result = self._verify_factual_accuracy(original_desc, result, high_confidence_objects)
461
+
462
+ # 確保場景類型和視角一致性
463
+ result = self._ensure_scene_type_consistency(result, scene_type, original_desc)
464
+ if perspective and perspective.lower() not in result.lower():
465
+ result = f"{perspective}, {result[0].lower()}{result[1:]}"
466
+
467
+ return str(result)
468
+
469
+ except Exception as e:
470
+ self.logger.error(f"Enhancement failed: {str(e)}")
471
+ import traceback
472
+ self.logger.error(traceback.format_exc())
473
+ return original_desc # 發生任何錯誤時返回原始描述
474
+
+    def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
+        """Verify that the generated description contains no information absent from the original description or object list."""
+
+        # Combine the original description and the object list into the authorized vocabulary source
+        authorized_content = original.lower() + " " + object_list.lower()
+
+        # Check substantive nouns in the generated description against
+        # lists of common location, cultural, and regional terms.
+        location_terms = ["plaza", "square", "market", "mall", "avenue", "boulevard"]
+        cultural_terms = ["european", "asian", "american", "african", "western", "eastern"]
+
+        # Check each term against the generated text
+        for term in location_terms + cultural_terms:
+            # Replace the term only when it appears in the generated text but not in the authorized content
+            if term in generated.lower() and term not in authorized_content:
+                # Choose an appropriate replacement based on the term type
+                if term in location_terms:
+                    replacement = "area"
+                else:
+                    replacement = "scene"
+
+                # Replace with whole-word matching via a regular expression
+                pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
+                generated = pattern.sub(replacement, generated)
+
+        return generated
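+    # Hypothetical worked example (annotation, not part of the original commit):
+    # with original="a busy city street with 3 people", object_list="3 people",
+    # and generated="a european plaza with 3 people", neither "plaza" nor
+    # "european" appears in the authorized content, so the output becomes
+    # "a scene area with 3 people".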
+
+    def verify_detection(self,
+                         detected_objects: List[Dict],
+                         clip_analysis: Dict[str, Any],
+                         scene_type: str,
+                         scene_name: str,
+                         confidence: float) -> Dict[str, Any]:
+        """
+        Verify and potentially correct YOLO's detection results.
+
+        Args:
+            detected_objects: List of objects detected by YOLO
+            clip_analysis: CLIP analysis results
+            scene_type: Identified scene type
+            scene_name: Scene name
+            confidence: Confidence of the scene classification
+
+        Returns:
+            Dict: Dictionary containing the verification result and suggestions
+        """
+        # Ensure the model is loaded
+        self._load_model()
+
+        # Format the data
+        objects_str = self._format_objects_for_prompt(detected_objects)
+        clip_str = self._format_clip_results(clip_analysis)
+
+        # Build the prompt
+        prompt = self.verify_detection_template.format(
+            scene_type=scene_type,
+            scene_name=scene_name,
+            confidence=confidence,
+            detected_objects=objects_str,
+            clip_analysis=clip_str
+        )
+
+        # Call the LLM to perform the verification
+        verification_result = self._generate_llm_response(prompt)
+
+        # Parse the verification result
+        result = {
+            "verification_text": verification_result,
+            "has_errors": "appear accurate" not in verification_result.lower(),
+            "corrected_objects": None  # detailed error correction may be implemented in a future version
+        }
+
+        return result
+
+    def _validate_content_consistency(self, original_desc: str, enhanced_desc: str) -> str:
+        """Validate that the content of the enhanced description is consistent with the original description."""
+        # Extract key numbers from the original description
+        people_count_match = re.search(r'(\d+)\s+people', original_desc, re.IGNORECASE)
+        people_count = int(people_count_match.group(1)) if people_count_match else None
+
+        # Verify people-count consistency
+        if people_count:
+            enhanced_count_match = re.search(r'(\d+)\s+people', enhanced_desc, re.IGNORECASE)
+            if not enhanced_count_match or int(enhanced_count_match.group(1)) != people_count:
+                # Keep the original people count
+                if enhanced_count_match:
+                    enhanced_desc = re.sub(r'\b\d+\s+people\b', f"{people_count} people", enhanced_desc, flags=re.IGNORECASE)
+                elif "people" in enhanced_desc.lower():
+                    enhanced_desc = re.sub(r'\bpeople\b', f"{people_count} people", enhanced_desc, flags=re.IGNORECASE)
+
+        # Verify viewpoint/perspective consistency
+        perspective_terms = ["aerial", "bird's-eye", "overhead", "ground level", "eye level"]
+
+        for term in perspective_terms:
+            if term in original_desc.lower() and term not in enhanced_desc.lower():
+                # Add the missing perspective information
+                if enhanced_desc[0].isupper():
+                    enhanced_desc = f"From {term} view, {enhanced_desc[0].lower()}{enhanced_desc[1:]}"
+                else:
+                    enhanced_desc = f"From {term} view, {enhanced_desc}"
+                break
+
+        return enhanced_desc
+
+    def _remove_explanatory_notes(self, response: str) -> str:
+        """Remove explanatory notes, remarks, and other non-descriptive content."""
+
+        # Identify common note and explanation patterns
+        note_patterns = [
+            r'(?:^|\n)Note:.*?(?:\n|$)',
+            r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
+            r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
+            r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
+        ]
+
+        # Find the first complete descriptive paragraph
+        paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
+
+        # If there is only one paragraph, check and clean it
+        if len(paragraphs) == 1:
+            for pattern in note_patterns:
+                paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
+            return paragraphs[0].strip()
+
+        # With multiple paragraphs, identify and drop the note paragraphs
+        content_paragraphs = []
+        for paragraph in paragraphs:
+            is_note = False
+            for pattern in note_patterns:
+                if re.search(pattern, paragraph, flags=re.IGNORECASE):
+                    is_note = True
+                    break
+
+            # Check whether the paragraph starts with a common note keyword
+            if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
+                is_note = True
+
+            if not is_note:
+                content_paragraphs.append(paragraph)
+
+        # Return the cleaned content
+        return '\n\n'.join(content_paragraphs).strip()
+
+    def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
+        """
+        Handle the case where YOLO detects no objects.
+
+        Args:
+            clip_analysis: CLIP analysis results
+
+        Returns:
+            str: Generated scene description
+        """
+        # Ensure the model is loaded
+        self._load_model()
+
+        # Extract the CLIP results
+        top_scene, top_confidence = clip_analysis.get("top_scene", ("unknown", 0))
+        viewpoint = clip_analysis.get("viewpoint", ("standard", 0))[0]
+        lighting = clip_analysis.get("lighting_condition", ("unknown", 0))[0]
+
+        # Format the cultural analysis
+        cultural_str = self._format_cultural_analysis(clip_analysis.get("cultural_analysis", {}))
+
+        # Build the prompt
+        prompt = self.no_detection_template.format(
+            top_scene=top_scene,
+            top_confidence=top_confidence,
+            viewpoint=viewpoint,
+            lighting_condition=lighting,
+            cultural_analysis=cultural_str
+        )
+
+        # Call the LLM to generate the description
+        description = self._generate_llm_response(prompt)
+
+        # Polish the output
+        return self._clean_llm_response(description)
+
+    def _clean_input_text(self, text: str) -> str:
+        """
+        Apply general formatting cleanup to the input text, handling common formatting issues.
+
+        Args:
+            text: Input text
+
+        Returns:
+            The cleaned text
+        """
+        if not text:
+            return ""
+
+        # Clean up formatting issues:
+        # 1. Collapse runs of repeated punctuation
+        text = re.sub(r'([.,;:!?])\1+', r'\1', text)
+
+        # 2. Fix the punctuation of incomplete sentences (e.g. "Something," with nothing following)
+        text = re.sub(r',\s*$', '.', text)
+
+        # 3. Fix a sentence like "word." running straight into the next sentence without a space
+        text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)
+
+        # 4. Remove redundant whitespace
+        text = re.sub(r'\s+', ' ', text).strip()
+
+        # 5. Make sure the text ends as a sentence (append a period if needed)
+        if text and not text[-1] in '.!?':
+            text += '.'
+
+        return text
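+    # Hypothetical worked example (annotation, not part of the original commit):
+    #   _clean_input_text("A busy street,, with cars..People walking")
+    #   -> "A busy street, with cars. People walking."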
+
+    def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
+        """
+        Verify and potentially correct the enhanced description so that it stays factually
+        accurate, targeting universal facts rather than scene-specific ones.
+
+        Args:
+            original_desc: Original scene description
+            enhanced_desc: Enhanced description to be verified
+            scene_type: Scene type
+            detected_objects: List of detected object names
+
+        Returns:
+            The fact-checked description
+        """
+        # If the enhanced description is empty or too short, return the original description
+        if not enhanced_desc or len(enhanced_desc) < 30:
+            return original_desc
+
+        # 1. Check numeric consistency (people counts, object counts, etc.)
+        # Extract numbers and the related nouns from the original description
+        number_patterns = [
+            (r'(\d+)\s+(people|person|pedestrians|individuals)', r'\1', r'\2'),  # people count
+            (r'(\d+)\s+(cars|vehicles|automobiles)', r'\1', r'\2'),              # vehicle count
+            (r'(\d+)\s+(buildings|structures)', r'\1', r'\2')                    # building count
+        ]
+
+        # Check every number in the original description
+        for pattern, num_group, word_group in number_patterns:
+            original_matches = re.finditer(pattern, original_desc, re.IGNORECASE)
+            for match in original_matches:
+                number = match.group(1)
+                noun = match.group(2)
+
+                # Check whether the enhanced description preserved this number.
+                # Build a more general pattern covering this number together with the object class
+                enhanced_pattern = r'(\d+)\s+(' + re.escape(noun) + r'|' + re.escape(noun.rstrip('s')) + r'|' + re.escape(noun + 's') + r')'
+                enhanced_matches = list(re.finditer(enhanced_pattern, enhanced_desc, re.IGNORECASE))
+
+                if not enhanced_matches:
+                    # The number + noun was not found in the enhanced description
+                    plural_form = noun if noun.endswith('s') or number == '1' else noun + 's'
+                    if enhanced_desc.startswith("This "):
+                        enhanced_desc = enhanced_desc.replace("This ", f"This scene with {number} {plural_form} ", 1)
+                    elif enhanced_desc.startswith("The "):
+                        enhanced_desc = enhanced_desc.replace("The ", f"The scene with {number} {plural_form} ", 1)
+                    else:
+                        enhanced_desc = f"The scene includes {number} {plural_form}. " + enhanced_desc
+                else:
+                    # The noun is present but the count may disagree, so correct any mismatched numbers
+                    for ematch in enhanced_matches:
+                        wrong_number = ematch.group(1)
+                        if wrong_number != number:
+                            enhanced_desc = enhanced_desc.replace(f"{wrong_number} {ematch.group(2)}", f"{number} {ematch.group(2)}")
+
+        # 2. Check perspective consistency
+        perspective_terms = {
+            "aerial": ["aerial", "bird's-eye", "overhead", "top-down", "above", "looking down"],
+            "ground": ["street-level", "ground level", "eye-level", "standing"],
+            "indoor": ["inside", "interior", "indoor", "within"],
+            "close-up": ["close-up", "detailed view", "close shot"]
+        }
+
+        # Determine the original perspective
+        original_perspective = None
+        for persp, terms in perspective_terms.items():
+            if any(term in original_desc.lower() for term in terms):
+                original_perspective = persp
+                break
+
+        # Check whether the perspective was preserved
+        if original_perspective:
+            enhanced_has_perspective = any(term in enhanced_desc.lower() for term in perspective_terms[original_perspective])
+
+            if not enhanced_has_perspective:
+                # Add the previously missing perspective framing
+                perspective_prefixes = {
+                    "aerial": "From an aerial perspective, ",
+                    "ground": "From street level, ",
+                    "indoor": "In this indoor setting, ",
+                    "close-up": "In this close-up view, "
+                }
+
+                prefix = perspective_prefixes.get(original_perspective, "")
+                if prefix:
+                    if enhanced_desc[0].isupper():
+                        enhanced_desc = prefix + enhanced_desc[0].lower() + enhanced_desc[1:]
+                    else:
+                        enhanced_desc = prefix + enhanced_desc
+
+        # 3. Check scene-type consistency
+        if scene_type and scene_type.lower() != "unknown" and scene_type.lower() not in enhanced_desc.lower():
+            # Add the scene type gracefully
+            if enhanced_desc.startswith("This ") or enhanced_desc.startswith("The "):
+                # Avoid duplications like "This scene" plus "This intersection"
+                if "scene" in enhanced_desc[:15].lower():
+                    fixed_type = scene_type.lower()
+                    enhanced_desc = enhanced_desc.replace("scene", fixed_type, 1)
+                elif enhanced_desc.startswith("This "):
+                    enhanced_desc = enhanced_desc.replace("This ", f"This {scene_type} ", 1)
+                else:
+                    enhanced_desc = enhanced_desc.replace("The ", f"The {scene_type} ", 1)
+            else:
+                enhanced_desc = f"This {scene_type} " + enhanced_desc
+
+        # 4. Keep the length within bounds; this cap must match the prompt's limit, or the two will contradict each other
+        words = enhanced_desc.split()
+        if len(words) > 200:
+            # Find a sentence ending near the word limit
+            truncated = ' '.join(words[:200])
+            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
+
+            if last_period > 0:
+                enhanced_desc = truncated[:last_period+1]
+            else:
+                enhanced_desc = truncated + '.'
+
+        return enhanced_desc
+
+    def _extract_perspective_from_description(self, description: str) -> str:
+        """Extract viewpoint/perspective information from the original description."""
+        perspective_terms = {
+            "aerial": ["aerial perspective", "aerial view", "bird's-eye view", "overhead view", "from above"],
+            "ground": ["ground level", "eye level", "street level"],
+            "indoor": ["indoor setting", "inside", "interior"]
+        }
+
+        for persp_type, terms in perspective_terms.items():
+            for term in terms:
+                if term.lower() in description.lower():
+                    return term
+
+        return ""
+
+    def _extract_objects_from_description(self, description: str) -> List[str]:
+        """Extract object mentions from the original description."""
+        # Regular-expression patterns for common objects
+        object_patterns = [
+            r'(\d+)\s+(people|persons|pedestrians|individuals)',
+            r'(\d+)\s+(cars|vehicles|automobiles)',
+            r'(\d+)\s+(buildings|structures)',
+            r'(\d+)\s+(plants|potted plants|flowers)',
+            r'(\d+)\s+(beds|furniture|tables|chairs)'
+        ]
+
+        extracted_objects = []
+
+        for pattern in object_patterns:
+            matches = re.finditer(pattern, description, re.IGNORECASE)
+            for match in matches:
+                number = match.group(1)
+                object_type = match.group(2)
+                extracted_objects.append(f"{number} {object_type}")
+
+        return extracted_objects
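+    # Hypothetical worked example (annotation, not part of the original commit):
+    # "An aerial view of a street with 5 people and 3 cars" yields
+    # ["5 people", "3 cars"].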
+
+    def _ensure_scene_type_consistency(self, description: str, scene_type: str, original_desc: str) -> str:
+        """Ensure the scene type in the description is consistent with the specified scene type."""
+        # List of incorrect scene words that must not be used
+        prohibited_scene_words = ["plaza", "square", "european", "asian", "american"]
+
+        # Check for prohibited scene words
+        for word in prohibited_scene_words:
+            if word in description.lower() and word not in original_desc.lower() and word not in scene_type.lower():
+                # Replace the incorrect scene word with the correct scene type
+                pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
+                description = pattern.sub(scene_type, description)
+
+        # Make sure the scene type is mentioned in the description
+        if scene_type.lower() not in description.lower():
+            # Look for a generic scene word to replace
+            for general_term in ["scene", "area", "place", "location"]:
+                if general_term in description.lower():
+                    pattern = re.compile(r'\b' + general_term + r'\b', re.IGNORECASE)
+                    description = pattern.sub(scene_type, description, count=1)
+                    break
+            else:
+                # If no generic word is found, add the scene type at the beginning
+                if description.startswith("The "):
+                    description = description.replace("The ", f"The {scene_type} ", 1)
+                elif description.startswith("This "):
+                    description = description.replace("This ", f"This {scene_type} ", 1)
+                else:
+                    description = f"This {scene_type} " + description
+
+        return description
+
+    def _generate_llm_response(self, prompt: str) -> str:
+        """Generate the LLM's response."""
+        self._load_model()
+
+        try:
+            self.call_count += 1
+            self.logger.info(f"LLM call #{self.call_count}")
+
+            # Clear the GPU cache
+            torch.cuda.empty_cache()
+
+            # Set a fixed seed for better consistency
+            torch.manual_seed(42)
+
+            # Prepare the inputs
+            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=self.max_length).to(self.device)
+
+            # Adjust the parameters according to the model type
+            generation_params = {
+                "max_new_tokens": 120,
+                "pad_token_id": self.tokenizer.eos_token_id,
+                "attention_mask": inputs.attention_mask,
+                "use_cache": True,
+            }
+
+            # Model-specific parameters for Llama models
+            if "llama" in self.model_path.lower():
+                generation_params.update({
+                    "temperature": 0.4,  # keep this low, otherwise the model becomes too opinionated
+                    "max_new_tokens": 600,
+                    "do_sample": True,
+                    "top_p": 0.8,
+                    "repetition_penalty": 1.2,  # repetition penalty to avoid repeated words
+                    "num_beams": 4,
+                    "length_penalty": 1.2,
+                })
+
+            else:
+                # Parameters for other models
+                generation_params.update({
+                    "temperature": 0.6,
+                    "max_new_tokens": 300,
+                    "top_p": 0.9,
+                    "do_sample": True,
+                    "num_beams": 1,
+                    "repetition_penalty": 1.05
+                })
+
+            # Generate the response
+            with torch.no_grad():
+                outputs = self.model.generate(inputs.input_ids, **generation_params)
+
+            # Decode the full output
+            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            # Extract the generated response portion
+            assistant_tag = "<|assistant|>"
+            if assistant_tag in full_response:
+                response = full_response.split(assistant_tag)[-1].strip()
+
+                # Check for an unclosed <|assistant|> section
+                user_tag = "<|user|>"
+                if user_tag in response:
+                    response = response.split(user_tag)[0].strip()
+            else:
+                # Strip the input prompt
+                input_text = self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
+                response = full_response
+                if response.startswith(input_text):
+                    response = response[len(input_text):].strip()
+
+            # Make sure we never return an empty response
+            if not response or len(response.strip()) < 10:
+                self.logger.warning("Generated response was empty or too short, returning default response")
+                return "No detailed description could be generated."
+
+            return response
+
+        except Exception as e:
+            self.logger.error(f"Error while generating the LLM response: {str(e)}")
+            import traceback
+            self.logger.error(traceback.format_exc())
+            return "Unable to generate enhanced description."
+
+    def _clean_llm_response(self, response: str) -> str:
+        """
+        Clean the LLM response to ensure the output contains only clean descriptive text.
+        Sometimes the model outputs not only the description but also tags, notes, etc.
+
+        Args:
+            response: Original response from the LLM
+
+        Returns:
+            Cleaned description text
+        """
+        if not response:
+            return ""
+
+        # Save the original response as a backup
+        original_response = response
+
+        # 1. Extract content between markers (if present)
+        output_start = response.find("[OUTPUT_START]")
+        output_end = response.find("[OUTPUT_END]")
+        if output_start != -1 and output_end != -1 and output_end > output_start:
+            response = response[output_start + len("[OUTPUT_START]"):output_end].strip()
+
+        # 2. Remove all remaining section markers and instructions
+        section_markers = [
+            r'\[.*?\]',                              # [any text]
+            r'OUTPUT_START\s*:|OUTPUT_END\s*:',      # OUTPUT_START: or OUTPUT_END:
+            r'ENHANCED DESCRIPTION\s*:',             # ENHANCED DESCRIPTION:
+            r'Scene Type\s*:.*?(?=\n|$)',            # Scene Type: text
+            r'Original Description\s*:.*?(?=\n|$)',  # Original Description: text
+            r'GOOD\s*:|BAD\s*:',                     # GOOD: or BAD:
+            r'PROBLEM\s*:.*?(?=\n|$)',               # PROBLEM: text
+            r'</?\|(?:assistant|system|user)\|>',    # Dialog markers
+            r'\(Note:.*?\)',                         # Notes in parentheses
+            r'\(.*?I\'ve.*?\)',                      # Common explanatory content
+            r'\(.*?as per your request.*?\)'         # References to instructions
+        ]
+
+        for marker in section_markers:
+            response = re.sub(marker, '', response, flags=re.IGNORECASE)
+
+        # 3. Remove common prefixes and suffixes
+        prefixes_to_remove = [
+            "Enhanced Description:",
+            "Scene Description:",
+            "Description:",
+            "Here is the enhanced description:",
+            "Here's the enhanced description:"
+        ]
+
+        for prefix in prefixes_to_remove:
+            if response.lower().startswith(prefix.lower()):
+                response = response[len(prefix):].strip()
+
+        # 4. Remove any Context tags or text containing Context
+        response = re.sub(r'<\s*Context:.*?>', '', response)
+        response = re.sub(r'Context:.*?(?=\n|$)', '', response)
+        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)
+
+        # 5. Clean improper scene type references
+        scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
+        match = re.search(scene_type_pattern, response)
+        if match and '_' in match.group(1):
+            fixed_text = f"This scene {match.group(2)}"
+            response = re.sub(scene_type_pattern, fixed_text, response)
+
+        # 6. Reduce dash usage for more natural punctuation
+        response = re.sub(r'—', ', ', response)
+        response = re.sub(r' - ', ', ', response)
+
+        # 7. Remove excess whitespace and line breaks
+        response = response.replace('\r', ' ')
+        response = re.sub(r'\n+', ' ', response)     # replace all newlines with spaces
+        response = re.sub(r'\s{2,}', ' ', response)  # collapse multiple spaces into one
+
+        # 8. Remove Markdown formatting
+        response = re.sub(r'\*\*|\*|__|\|', '', response)  # remove Markdown indicators
+
+        # 9. Detect and remove sentence duplicates
+        sentences = re.split(r'(?<=[.!?])\s+', response)
+        unique_sentences = []
+        seen_content = set()
+
+        for sentence in sentences:
+            # Skip empty sentences
+            if not sentence.strip():
+                continue
+
+            # Create a simplified version for comparison (lowercase, no punctuation)
+            simplified = re.sub(r'[^\w\s]', '', sentence.lower())
+            simplified = ' '.join(simplified.split())  # standardize whitespace
+
+            # Check if we've seen a similar sentence
+            is_duplicate = False
+            for existing in seen_content:
+                if len(simplified) > 10 and (existing in simplified or simplified in existing):
+                    is_duplicate = True
+                    break
+
+            if not is_duplicate and simplified:
+                unique_sentences.append(sentence)
+                seen_content.add(simplified)
+
+        # Recombine the unique sentences
+        response = ' '.join(unique_sentences)
+
+        # 10. Keep the word count within the limit (truncate past 200 words)
+        words = response.split()
+        if len(words) > 200:
+            # Find a sentence ending near the word limit
+            truncated = ' '.join(words[:200])
+            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
+
+            if last_period > 0:
+                response = truncated[:last_period+1]
+            else:
+                response = truncated + "."
+
+        # 11. Check sentence completeness
+        if response and not response.strip()[-1] in ['.', '!', '?']:
+            # Find the last preposition or conjunction
+            common_prepositions = ["into", "onto", "about", "above", "across", "after", "along", "around", "at", "before", "behind", "below", "beneath", "beside", "between", "beyond", "by", "down", "during", "except", "for", "from", "in", "inside", "near", "of", "off", "on", "over", "through", "to", "toward", "under", "up", "upon", "with", "within"]
+
+            # Check if the text ends with a preposition or conjunction
+            last_word = response.strip().split()[-1].lower() if response.strip().split() else ""
+            if last_word in common_prepositions or last_word in ["and", "or", "but"]:
+                # Find the last complete sentence
+                last_period = max(response.rfind('.'), response.rfind('!'), response.rfind('?'))
+                if last_period > 0:
+                    response = response[:last_period+1]
+                else:
+                    # If no complete sentence is found, modify the ending
+                    words = response.strip().split()
+                    if words:
+                        # Remove the last preposition or conjunction
+                        response = " ".join(words[:-1]) + "."
+
+        # 12. Make sure we haven't over-filtered
+        if not response or len(response) < 40:
+            # Try to get the first meaningful paragraph from the original response
+            paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
+            if paragraphs:
+                # Choose the longest paragraph as it's most likely the actual description
+                best_para = max(paragraphs, key=len)
+                # Clean it using a subset of the above rules
+                best_para = re.sub(r'\[.*?\]', '', best_para)           # remove [SECTION] markers
+                best_para = re.sub(r'\s{2,}', ' ', best_para).strip()   # clean whitespace
+
+                if len(best_para) >= 40:
+                    return best_para
+
+            # If there is still no good content, return a simple message
+            return "Unable to generate a valid enhanced description."
+
+        # 13. Final cleaning - catch any missed special cases
+        response = re.sub(r'</?\|.*?\|>', '', response)  # any remaining tags
+        response = re.sub(r'\(.*?\)', '', response)      # any remaining parenthetical content
+        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)  # any remaining notes
+
+        # Ensure proper spacing after punctuation
+        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
+
+        # Ensure the first letter is capitalized
+        if response and response[0].islower():
+            response = response[0].upper() + response[1:]
+
+        # 14. Normalize the format - make sure the output is always a single paragraph
+        response = re.sub(r'\s*\n\s*', ' ', response)  # replace all newlines with spaces
+        response = ' '.join(response.split())
+
+        return response.strip()
+
+    def _format_objects_for_prompt(self, objects: List[Dict]) -> str:
+        """Format the object list for use in prompts."""
+        if not objects:
+            return "No objects detected"
+
+        formatted = []
+        for obj in objects:
+            formatted.append(f"{obj['class_name']} (confidence: {obj['confidence']:.2f})")
+
+        return "\n- " + "\n- ".join(formatted)
+
+    def _format_lighting(self, lighting_info: Dict) -> str:
+        """Format the lighting information for use in prompts."""
+        if not lighting_info:
+            return "Unknown lighting conditions"
+
+        time = lighting_info.get("time_of_day", "unknown")
+        conf = lighting_info.get("confidence", 0)
+        is_indoor = lighting_info.get("is_indoor", False)
+
+        base_info = f"{'Indoor' if is_indoor else 'Outdoor'} {time} (confidence: {conf:.2f})"
+
+        # Add more detailed diagnostic information
+        diagnostics = lighting_info.get("diagnostics", {})
+        if diagnostics:
+            diag_str = "\nAdditional lighting diagnostics:"
+            for key, value in diagnostics.items():
+                diag_str += f"\n- {key}: {value}"
+            base_info += diag_str
+
+        return base_info
+
+    def _format_zones(self, zones: Dict) -> str:
+        """Format the functional zones for use in prompts."""
+        if not zones:
+            return "No distinct functional zones identified"
+
+        formatted = ["Identified functional zones:"]
+        for zone_name, zone_data in zones.items():
+            desc = zone_data.get("description", "")
+            objects = zone_data.get("objects", [])
+
+            zone_str = f"- {zone_name}: {desc}"
+            if objects:
+                zone_str += f" (Contains: {', '.join(objects)})"
+
+            formatted.append(zone_str)
+
+        return "\n".join(formatted)
+
+    def _format_clip_results(self, clip_analysis: Dict) -> str:
+        """Format the CLIP analysis results for use in prompts."""
+        if not clip_analysis or "error" in clip_analysis:
+            return "No CLIP analysis available"
+
+        parts = ["CLIP Analysis Results:"]
+
+        # Add the top scene
+        top_scene, confidence = clip_analysis.get("top_scene", ("unknown", 0))
+        parts.append(f"- Most likely scene: {top_scene} (confidence: {confidence:.2f})")
+
+        # Add the viewpoint
+        viewpoint, vp_conf = clip_analysis.get("viewpoint", ("standard", 0))
+        parts.append(f"- Camera viewpoint: {viewpoint} (confidence: {vp_conf:.2f})")
+
+        # Add object combinations
+        if "object_combinations" in clip_analysis:
+            combos = []
+            for combo, score in clip_analysis["object_combinations"][:3]:
+                combos.append(f"{combo} ({score:.2f})")
+            parts.append(f"- Object combinations: {', '.join(combos)}")
+
+        # Add the cultural analysis
+        if "cultural_analysis" in clip_analysis:
+            parts.append("- Cultural analysis:")
+            for culture_type, data in clip_analysis["cultural_analysis"].items():
+                best_desc = data.get("best_description", "")
+                desc_conf = data.get("confidence", 0)
+                parts.append(f"  * {culture_type}: {best_desc} ({desc_conf:.2f})")
+
+        return "\n".join(parts)
+
+    def _format_cultural_analysis(self, cultural_analysis: Dict) -> str:
+        """Format the cultural analysis results."""
+        if not cultural_analysis:
+            return "No specific cultural elements detected"
+
+        parts = []
+        for culture_type, data in cultural_analysis.items():
+            best_desc = data.get("best_description", "")
+            desc_conf = data.get("confidence", 0)
+            parts.append(f"{culture_type}: {best_desc} (confidence: {desc_conf:.2f})")
+
+        return "\n".join(parts)
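A minimal usage sketch for the new module (an annotation, not part of the commit: the scene_data keys mirror the dictionary that scene_analyzer.py builds below, the example values are hypothetical, and passing model_path=None is assumed to fall back to the module's default model):

    from llm_enhancer import LLMEnhancer

    enhancer = LLMEnhancer(model_path=None)
    scene_data = {
        "original_description": "A city intersection with 3 people and 2 cars.",
        "scene_type": "city_intersection",
        "detected_objects": [{"class_name": "person", "confidence": 0.92}],
        "lighting_info": {"time_of_day": "day", "is_indoor": False},
    }
    print(enhancer.enhance_description(scene_data))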
requirements.txt CHANGED
@@ -9,3 +9,8 @@ gradio>=3.32.0
 git+https://github.com/openai/CLIP.git
 yt-dlp>=2023.3.4
 requests>=2.28.1
+transformers
+accelerate
+bitsandbytes
+sentencepiece
+huggingface_hub>=0.19.0
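The five additions back the new llm_enhancer module: transformers and accelerate load and dispatch the chat model, bitsandbytes enables quantized inference on limited GPU memory, sentencepiece backs Llama-style tokenizers, and huggingface_hub fetches the weights. A minimal loading sketch under those assumptions (the model id is a placeholder, not one pinned by this commit):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    model_id = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder, not the repo's configured model
    quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_config,
        device_map="auto",  # dispatched by accelerate
    )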
scene_analyzer.py CHANGED
@@ -6,6 +6,7 @@ from spatial_analyzer import SpatialAnalyzer
 from scene_description import SceneDescriptor
 from enhance_scene_describer import EnhancedSceneDescriber
 from clip_analyzer import CLIPAnalyzer
+from llm_enhancer import LLMEnhancer
 from scene_type import SCENE_TYPES
 from object_categories import OBJECT_CATEGORIES
 
@@ -14,7 +15,7 @@ class SceneAnalyzer:
     Core class for scene analysis and understanding based on object detection results.
     Analyzes detected objects, their relationships, and infers the scene type.
     """
-    def __init__(self, class_names: Dict[int, str] = None):
+    def __init__(self, class_names: Dict[int, str] = None, use_llm: bool = True, llm_model_path: str = None):
         """
         Initialize the scene analyzer with optional class name mappings.
         Args:
@@ -40,6 +41,18 @@ class SceneAnalyzer:
             print("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
             self.use_clip = False
 
+        # Initialize the LLM enhancer
+        self.use_llm = use_llm
+        if use_llm:
+            try:
+                self.llm_enhancer = LLMEnhancer(model_path=llm_model_path)
+                print("LLM enhancer initialized successfully.")
+            except Exception as e:
+                print(f"Warning: Could not initialize LLM enhancer: {e}")
+                print("Scene analysis will proceed without LLM. Make sure required packages are installed.")
+                self.use_llm = False
+
     def generate_scene_description(self,
                                    scene_type,
                                    detected_objects,
@@ -106,8 +119,31 @@ class SceneAnalyzer:
         Returns:
             Dictionary with scene analysis results
         """
-        # If no result or no detections, return empty analysis
+        # If no result or no detections, handle with LLM if possible
         if detection_result is None or len(detection_result.boxes) == 0:
+            if self.use_llm and self.use_clip and detection_result is not None:
+                # Use CLIP and the LLM to analyze the no-detection case
+                try:
+                    original_image = detection_result.orig_img
+                    clip_analysis = self.clip_analyzer.analyze_image(original_image)
+                    llm_description = self.llm_enhancer.handle_no_detection(clip_analysis)
+
+                    return {
+                        "scene_type": "llm_inferred",
+                        "confidence": clip_analysis.get("top_scene", ("unknown", 0))[1],
+                        "description": "No objects detected by standard detection.",
+                        "enhanced_description": llm_description,
+                        "objects_present": [],
+                        "object_count": 0,
+                        "regions": {},
+                        "possible_activities": [],
+                        "safety_concerns": [],
+                        "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
+                    }
+                except Exception as e:
+                    print(f"Error in LLM no-detection handling: {e}")
+
+            # If LLM/CLIP are unavailable or the handling fails, return the original no-detection result
            return {
                 "scene_type": "unknown",
                 "confidence": 0,
@@ -226,6 +262,53 @@ class SceneAnalyzer:
             functional_zones=functional_zones
         )
 
+        # Run LLM-based enhancement
+        enhanced_description = None
+        llm_verification = None
+
+        if self.use_llm:
+            try:
+                # Prepare the scene data for the LLM
+                scene_data = {
+                    "original_description": scene_description,
+                    "scene_type": best_scene,
+                    "scene_name": self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown"),
+                    "detected_objects": detected_objects,
+                    "confidence": scene_confidence,
+                    "lighting_info": lighting_info,
+                    "functional_zones": functional_zones,
+                    "activities": activities,
+                    "safety_concerns": safety_concerns,
+                    "clip_analysis": clip_analysis
+                }
+
+                # If the CLIP and YOLO results diverge significantly, verify with the LLM
+                if self.use_clip and clip_analysis and "top_scene" in clip_analysis:
+                    clip_top_scene = clip_analysis["top_scene"][0]
+                    clip_confidence = clip_analysis["top_scene"][1]
+
+                    # Verify when CLIP and YOLO predict different scenes, both with fairly high confidence
+                    if clip_top_scene != best_scene and clip_confidence > 0.4 and scene_confidence > 0.4:
+                        llm_verification = self.llm_enhancer.verify_detection(
+                            detected_objects,
+                            clip_analysis,
+                            best_scene,
+                            self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown"),
+                            scene_confidence
+                        )
+
+                        # Add the verification result to the scene data
+                        scene_data["verification_result"] = llm_verification.get("verification_text", "")
+
+                # Generate the enhanced description with the LLM
+                enhanced_description = self.llm_enhancer.enhance_description(scene_data)
+
+            except Exception as e:
+                print(f"Error in LLM enhancement: {e}")
+                import traceback
+                traceback.print_exc()
+                enhanced_description = None
+
         # Return comprehensive analysis
         result = {
             "scene_type": best_scene if scene_confidence >= scene_confidence_threshold else "unknown",
@@ -233,6 +316,7 @@ class SceneAnalyzer:
                 if scene_confidence >= scene_confidence_threshold else "Unknown Scene",
             "confidence": scene_confidence,
             "description": scene_description,
+            "enhanced_description": enhanced_description,  # LLM-enhanced description
             "objects_present": [
                 {"class_id": obj["class_id"],
                  "class_name": obj["class_name"],
@@ -248,6 +332,12 @@ class SceneAnalyzer:
             "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
         }
 
+        # Attach the LLM verification result to the output when available
+        if llm_verification:
+            result["llm_verification"] = llm_verification.get("verification_text")
+            if llm_verification.get("has_errors", False):
+                result["detection_warnings"] = "LLM detected potential issues with object recognition"
+
         # Add CLIP-specific results
         if clip_analysis and "error" not in clip_analysis:
             result["clip_analysis"] = {
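A sketch of how the extended analyzer might be consumed (an annotation, not part of the commit: the entry-point name `analyze` and its simplified signature are assumptions based on the docstring in the hunk above; the real method may take additional arguments such as lighting_info):

    analyzer = SceneAnalyzer(class_names=class_names, use_llm=True, llm_model_path=None)
    analysis = analyzer.analyze(detection_result)  # hypothetical entry point
    print(analysis["description"])                 # rule-based description
    print(analysis.get("enhanced_description"))    # LLM-enhanced text, None when the LLM is disabled
    if "llm_verification" in analysis:
        print(analysis["llm_verification"])        # present only when CLIP and YOLO disagreed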
scene_type.py CHANGED
@@ -282,13 +282,6 @@ SCENE_TYPES = {
         "minimum_required": 1,
         "description": "A traditional Asian temple complex with visitors and cultural elements"
     },
-    "european_plaza": {
-        "name": "European Plaza",
-        "required_objects": [0],  # person
-        "optional_objects": [1, 2, 4, 9, 24, 26, 67],  # bicycle, car, airplane, traffic light, backpack, handbag, cell phone
-        "minimum_required": 1,
-        "description": "A European-style city plaza with historic architecture and pedestrian activity"
-    },
 
     # specific time item
     "nighttime_street": {
style.py CHANGED
@@ -289,13 +289,13 @@ class Style:
             padding: 15px !important;          /* inner padding so the text has room from the border */
             border-radius: 8px !important;     /* rounded corners */
             margin: 10px 0 20px 0 !important;  /* spacing from other elements, above and below in particular */
-            display: block !important;
-            width: 100% !important;
-            box-sizing: border-box !important;
+            display: block !important;
+            width: 100% !important;
+            box-sizing: border-box !important;
         }
 
         #scene_analysis_description_text p {
-            margin: 0 !important;
+            margin: 0 !important;
             color: #2D3748 !important;         /* ensure the text color */
             font-family: Arial, sans-serif !important;
             font-size: 16px !important;        /* adjust the text size as needed */
@@ -485,6 +485,37 @@ class Style:
             max-width: 100% !important;
         }
 
+        /* Styles for the LLM-enhanced description */
+        #llm_enhanced_description_text {
+            padding: 15px !important;
+            background-color: #ffffff !important;
+            border-radius: 8px !important;
+            border: 1px solid #e2e8f0 !important;
+            margin-bottom: 20px !important;
+            box-shadow: 0 1px 3px rgba(0,0,0,0.05) !important;
+            font-family: Arial, sans-serif !important;
+            line-height: 1.7 !important;
+            color: #2D3748 !important;
+            font-size: 16px !important;
+            width: 100% !important;
+            box-sizing: border-box !important;
+            min-height: 200px !important;
+        }
+
+        /* Styles for the collapsible original-description area */
+        #original_scene_analysis_accordion {
+            margin-top: 10px !important;
+            margin-bottom: 20px !important;
+            background-color: #f8f9fa !important;
+            border-radius: 8px !important;
+            border: 1px solid #e2e8f0 !important;
+        }
+
+        /* Keep the accordion content consistent with the page styles */
+        #original_scene_analysis_accordion > div:nth-child(2) {
+            padding: 15px !important;
+        }
+
         /* Animation effects to add interactivity */
         @keyframes fadeIn {
             from { opacity: 0; }