thinksoso committed
Commit d75a39b · verified · 1 Parent(s): e51b638

Update README.md

Files changed (1):
  1. README.md +147 -75
README.md CHANGED
@@ -72,23 +72,24 @@ Fleming-VL is a multimodal reasoning model for medical scenarios that can proces
 ## 🔧 Quick Start
 
 ```python
-"""
-Fleming-VL-8B Multi-Modal Inference Script
+# Fleming-VL-8B Multi-Modal Inference Script
 
-This script demonstrates three inference modes:
-1. Single image inference
-2. Video inference (frame-by-frame)
-3. 3D medical image (CT/MRI) inference from .npy files
+# This script demonstrates three inference modes:
+# 1. Single image inference
+# 2. Video inference (frame-by-frame)
+# 3. 3D medical image (CT/MRI) inference from .npy files
 
-Model: UbiquantAI/Fleming-VL-8B
-Based on: InternVL_chat-1.2 template
-"""
+# Model: UbiquantAI/Fleming-VL-8B
+# Based on: InternVL_chat-1.2 template
+
+
 
-from transformers import AutoTokenizer, AutoModel, CLIPImageProcessor
+from transformers import AutoTokenizer, AutoModel
+from torchvision.transforms.functional import InterpolationMode
 from decord import VideoReader, cpu
 from PIL import Image
+import torchvision.transforms as T
 import numpy as np
-import shutil
 import torch
 import os
 
@@ -98,7 +99,6 @@ import os
 # ============================================================================
 
 MODEL_PATH = "UbiquantAI/Fleming-VL-8B"
-REQUIRED_FILES_DIR = './required_files'
 
 # Prompt template for reasoning-based responses
 REASONING_PROMPT = (
@@ -111,46 +111,105 @@ REASONING_PROMPT = (
     "<answer> answer here </answer>"
 )
 
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
 
 # ============================================================================
-# Utility Functions
+# Image Preprocessing Functions
 # ============================================================================
 
-def copy_necessary_files(target_path, source_path):
+def build_transform(input_size):
+    """Build image transformation pipeline."""
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=MEAN, std=STD)
+    ])
+    return transform
+
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    """Find the closest aspect ratio from target ratios."""
+    best_ratio_diff = float('inf')
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+
+def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
     """
-    Copy required model configuration files to the model directory.
+    Dynamically preprocess image by splitting into tiles based on aspect ratio.
 
     Args:
-        target_path: Destination directory (model path)
-        source_path: Source directory containing required files
+        image: PIL Image
+        min_num: Minimum number of tiles
+        max_num: Maximum number of tiles
+        image_size: Size of each tile
+        use_thumbnail: Whether to add a thumbnail image
+
+    Returns:
+        List of preprocessed PIL Images
     """
-    required_files = [
-        "modeling_internvl_chat.py",
-        "conversation.py",
-        "modeling_intern_vit.py",
-        "preprocessor_config.json",
-        "configuration_internvl_chat.py",
-        "configuration_intern_vit.py",
-    ]
-
-    for filename in required_files:
-        target_file = os.path.join(target_path, filename)
-        source_file = os.path.join(source_path, filename)
-
-        if not os.path.exists(target_file):
-            print(f"File {filename} not found in target path, copying from source...")
-
-            if os.path.exists(source_file):
-                try:
-                    shutil.copy2(source_file, target_file)
-                    print(f"Successfully copied {filename}")
-                except Exception as e:
-                    print(f"Error copying {filename}: {str(e)}")
-            else:
-                print(f"Warning: Source file {filename} does not exist, cannot copy")
-        else:
-            print(f"File {filename} already exists")
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # Calculate possible tile configurations
+    target_ratios = set(
+        (i, j) for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # Find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
 
+    # Calculate target dimensions
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # Resize and split the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size
+        )
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+
+    assert len(processed_images) == blocks
+
+    # Add thumbnail if requested
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+
+    return processed_images
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
 
 def load_model(model_path, use_flash_attn=True):
     """
@@ -184,7 +243,8 @@ def load_model(model_path, use_flash_attn=True):
 # Image Inference
 # ============================================================================
 
-def inference_single_image(model, tokenizer, image_path, question, prompt=REASONING_PROMPT):
+def inference_single_image(model, tokenizer, image_path, question,
+                           prompt=REASONING_PROMPT, input_size=448, max_num=12):
     """
     Perform inference on a single image.
 
@@ -194,23 +254,25 @@ def inference_single_image(model, tokenizer, image_path, question, prompt=REASON
         image_path: Path to the input image
         question: Question to ask about the image
         prompt: System prompt template
+        input_size: Input image size (default: 448)
+        max_num: Maximum number of tiles (default: 12)
 
     Returns:
         str: Model response
     """
-    # Load and preprocess image
-    image_processor = CLIPImageProcessor.from_pretrained(MODEL_PATH)
-    image = Image.open(image_path).resize((448, 448))
-    pixel_values = image_processor(
-        images=image,
-        return_tensors='pt'
-    ).pixel_values.to(torch.bfloat16).cuda()
+    # Load and preprocess image using InternVL's dynamic preprocessing
+    image = Image.open(image_path).convert('RGB')
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+    pixel_values = [transform(img) for img in images]
+    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
 
     # Prepare question with prompt and image token
     full_question = f"{prompt}\n<image>\n{question}"
+    # print("###", full_question)
 
     # Generate response
-    generation_config = dict(max_new_tokens=1024, do_sample=False)
+    generation_config = dict(max_new_tokens=2048, do_sample=False)
     response = model.chat(tokenizer, pixel_values, full_question, generation_config)
 
     return response
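For reference, a hedged usage sketch of the new `inference_single_image` signature; the image path and question below are placeholders, not values from the commit:

```python
# Hypothetical call into the updated single-image path; "sample.png" is a
# placeholder, and the checkpoint is downloaded on first use.
model, tokenizer = load_model(MODEL_PATH, use_flash_attn=True)
response = inference_single_image(
    model, tokenizer,
    image_path="sample.png",  # placeholder path
    question="What imaging modality is shown?",
    max_num=12,               # allow up to 12 tiles for high-resolution inputs
)
print(response)  # reasoning in <think>...</think>, answer in <answer>...</answer>
```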
@@ -251,14 +313,15 @@ def get_frame_indices(bound, fps, max_frame, first_idx=0, num_segments=32):
     return frame_indices
 
 
-def load_video(video_path, model_path, bound=None, num_segments=32):
+def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
     """
     Load and preprocess video frames.
 
     Args:
         video_path: Path to the video file
-        model_path: Path to the model (for image processor)
         bound: Time boundary tuple (start, end) in seconds
+        input_size: Input image size (default: 448)
+        max_num: Maximum number of tiles per frame (default: 1)
         num_segments: Number of frames to extract
 
     Returns:
@@ -270,14 +333,16 @@ def load_video(video_path, model_path, bound=None, num_segments=32):
 
     pixel_values_list = []
     num_patches_list = []
-    image_processor = CLIPImageProcessor.from_pretrained(model_path)
+    transform = build_transform(input_size=input_size)
 
     frame_indices = get_frame_indices(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
 
     for frame_index in frame_indices:
         # Extract and preprocess frame
-        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB').resize((448, 448))
-        pixel_values = image_processor(images=img, return_tensors='pt').pixel_values
+        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
+        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+        pixel_values = [transform(tile) for tile in img]
+        pixel_values = torch.stack(pixel_values)
         num_patches_list.append(pixel_values.shape[0])
         pixel_values_list.append(pixel_values)
 
@@ -285,7 +350,8 @@ def load_video(video_path, model_path, bound=None, num_segments=32):
     return pixel_values, num_patches_list
 
 
-def inference_video(model, tokenizer, video_path, video_duration, question, prompt=REASONING_PROMPT):
+def inference_video(model, tokenizer, video_path, video_duration, question,
+                    prompt=REASONING_PROMPT, input_size=448, max_num=1):
     """
     Perform inference on a video by sampling frames.
 
@@ -296,13 +362,18 @@ def inference_video(model, tokenizer, video_path, video_duration, question, prom
         video_duration: Duration of video in seconds
         question: Question to ask about the video
         prompt: System prompt template
+        input_size: Input image size (default: 448)
+        max_num: Maximum number of tiles per frame (default: 1)
 
     Returns:
         str: Model response
     """
     # Sample frames from video (1 frame per second)
     num_segments = int(video_duration)
-    pixel_values, num_patches_list = load_video(video_path, MODEL_PATH, num_segments=num_segments)
+    pixel_values, num_patches_list = load_video(
+        video_path, bound=None, input_size=input_size,
+        max_num=max_num, num_segments=num_segments
+    )
     pixel_values = pixel_values.to(torch.bfloat16).cuda()
 
     # Create image token prefix for all frames
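One detail worth noting: `load_video` now returns `num_patches_list` with one entry per sampled frame, and the prompt needs a matching `<image>` placeholder per frame. The prefix-building code sits just below this hunk and is unchanged by the commit, so the following is an assumption about the surrounding script, sketched in the usual InternVL style:

```python
# Sketch (assumption): one "Frame{i}: <image>" placeholder per sampled frame;
# num_patches_list ties each placeholder to that frame's patch count.
video_prefix = ''.join(f'Frame{i + 1}: <image>\n' for i in range(len(num_patches_list)))
full_question = f"{REASONING_PROMPT}\n{video_prefix}{question}"
generation_config = dict(max_new_tokens=2048, do_sample=False)
response = model.chat(tokenizer, pixel_values, full_question, generation_config,
                      num_patches_list=num_patches_list)
```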
@@ -349,7 +420,7 @@ def normalize_image(image):
     return ((image - img_min) / (img_max - img_min) * 255).astype(np.uint8)
 
 
-def convert_npy_to_images(npy_path, model_path, num_slices=11):
+def convert_npy_to_images(npy_path, input_size=448, max_num=1, num_slices=11):
     """
     Convert 3D medical image (.npy) to multiple 2D RGB images.
 
@@ -358,7 +429,8 @@ def convert_npy_to_images(npy_path, model_path, num_slices=11):
 
     Args:
         npy_path: Path to the .npy file
-        model_path: Path to the model (for image processor)
+        input_size: Input image size (default: 448)
+        max_num: Maximum number of tiles per slice (default: 1)
         num_slices: Number of slices to extract (default: 11)
 
     Returns:
@@ -380,7 +452,7 @@ def convert_npy_to_images(npy_path, model_path, num_slices=11):
     # Select evenly distributed slices from 32 slices
     indices = np.linspace(0, 31, num_slices, dtype=int)
 
-    image_processor = CLIPImageProcessor.from_pretrained(model_path)
+    transform = build_transform(input_size=input_size)
     pixel_values_list = []
     num_patches_list = []
 
@@ -398,8 +470,10 @@ def convert_npy_to_images(npy_path, model_path, num_slices=11):
         # Convert to PIL Image
         img = Image.fromarray(rgb_img)
 
-        # Preprocess with CLIP processor
-        pixel_values = image_processor(images=img, return_tensors='pt').pixel_values
+        # Preprocess with InternVL's dynamic preprocessing
+        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+        pixel_values = [transform(tile) for tile in img]
+        pixel_values = torch.stack(pixel_values)
         num_patches_list.append(pixel_values.shape[0])
         pixel_values_list.append(pixel_values)
 
@@ -411,7 +485,8 @@ def convert_npy_to_images(npy_path, model_path, num_slices=11):
         return False
 
 
-def inference_3d_medical_image(model, tokenizer, npy_path, question, prompt=REASONING_PROMPT):
+def inference_3d_medical_image(model, tokenizer, npy_path, question,
+                               prompt=REASONING_PROMPT, input_size=448, max_num=1):
     """
     Perform inference on 3D medical images stored as .npy files.
 
 
@@ -421,12 +496,14 @@ def inference_3d_medical_image(model, tokenizer, npy_path, question, prompt=REAS
421
  npy_path: Path to the .npy file (shape: 32x256x256)
422
  question: Question to ask about the image
423
  prompt: System prompt template
 
 
424
 
425
  Returns:
426
  str: Model response or None if error
427
  """
428
  # Convert 3D volume to multiple 2D slices
429
- result = convert_npy_to_images(npy_path, MODEL_PATH)
430
 
431
  if result is False:
432
  return None
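To smoke-test the 3D path without patient data, one can first synthesize a volume in the 32×256×256 layout the docstring expects; a minimal sketch with a placeholder filename:

```python
# Sketch: write a dummy 32x256x256 volume and run it through the conversion
# step; "dummy_volume.npy" is a placeholder filename.
import numpy as np

np.save("dummy_volume.npy", np.random.rand(32, 256, 256).astype(np.float32))
result = convert_npy_to_images("dummy_volume.npy")  # 11 evenly spaced slices by default
```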
@@ -463,8 +540,6 @@ def main():
     """
     Main function demonstrating all three inference modes.
     """
-    # Copy necessary files
-    copy_necessary_files(MODEL_PATH, REQUIRED_FILES_DIR)
 
     # ========================================================================
     # Example 1: Single Image Inference
@@ -473,11 +548,8 @@ def main():
     print("EXAMPLE 1: Single Image Inference")
     print("="*80)
 
-    image_path = "./test.png"
-    question = (
-        "What imaging technique was employed to obtain this picture?\n"
-        "A. PET scan. B. CT scan. C. Blood test. D. Fundus imaging."
-    )
+    image_path = "./resource/1.jpg"
+    question = 'What type of abnormality is present in this image?'
 
     model, tokenizer = load_model(MODEL_PATH, use_flash_attn=True)
     response = inference_single_image(model, tokenizer, image_path, question)
@@ -496,7 +568,7 @@ def main():
     print("EXAMPLE 2: Video Inference")
     print("="*80)
 
-    video_path = "./test.mp4"
+    video_path = "./resource/video.mp4"
     video_duration = 6 # seconds
     question = "Please describe the video."
 
@@ -517,7 +589,7 @@ def main():
     print("EXAMPLE 3: 3D Medical Image Inference")
     print("="*80)
 
-    npy_path = "./test.npy"
+    npy_path = "./resource/test.npy"
     question = "What device is observed on the chest wall?"
 
     # Example cases:
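Since every mode returns the templated `<think>/<answer>` string enforced by `REASONING_PROMPT`, downstream code usually strips the tags; a small hedged helper, not part of the commit:

```python
# Hypothetical helper: pull the final answer out of a templated response.
import re

def extract_answer(response: str) -> str:
    match = re.search(r"<answer>(.*?)</answer>", response, flags=re.DOTALL)
    return match.group(1).strip() if match else response.strip()
```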
 