Update: README
Browse files
README.md
CHANGED
@@ -7,14 +7,11 @@ language:
|
|
7 |
- multilingual
|
8 |
tags:
|
9 |
- minicpm-v
|
10 |
-
- VLM
|
11 |
- vision
|
12 |
- ocr
|
13 |
-
- document parsing
|
14 |
- multi-image
|
15 |
- video
|
16 |
- custom_code
|
17 |
-
|
18 |
---
|
19 |
|
20 |
<h1>A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone</h1>
|
@@ -289,7 +286,7 @@ for new_text in answer:
|
|
289 |
print(new_text, flush=True, end='')
|
290 |
|
291 |
# Second-round chat: pass the history of the multi-turn conversation as context
|
292 |
-
msgs.append({"role": "assistant", "content": [
|
293 |
msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
|
294 |
|
295 |
answer = model.chat(
|
@@ -406,7 +403,7 @@ def encode_video(video_path, choose_fps=3, force_packing=None):
|
|
406 |
|
407 |
video_path="video_test.mp4"
|
408 |
fps = 5 # fps for video
|
409 |
-
force_packing = None # You can set force_packing to ensure that 3D
|
410 |
frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)
|
411 |
|
412 |
question = "Describe the video"
|
@@ -418,7 +415,7 @@ msgs = [
|
|
418 |
answer = model.chat(
|
419 |
msgs=msgs,
|
420 |
tokenizer=tokenizer,
|
421 |
-
use_image_id=False,
|
422 |
max_slice_nums=1,
|
423 |
temporal_ids=frame_ts_id_group
|
424 |
)
|
|
|
7 |
- multilingual
|
8 |
tags:
|
9 |
- minicpm-v
|
|
|
10 |
- vision
|
11 |
- ocr
|
|
|
12 |
- multi-image
|
13 |
- video
|
14 |
- custom_code
|
|
|
15 |
---
|
16 |
|
17 |
<h1>A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone</h1>
|
|
|
286 |
print(new_text, flush=True, end='')
|
287 |
|
288 |
# Second-round chat: pass the history of the multi-turn conversation as context
|
289 |
+
msgs.append({"role": "assistant", "content": [generated_text]})
|
290 |
msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
|
291 |
|
292 |
answer = model.chat(
|
|
|
403 |
|
404 |
video_path="video_test.mp4"
|
405 |
fps = 5 # fps for video
|
406 |
+
force_packing = None # You can set force_packing to ensure that 3D packing is forcibly enabled; otherwise, encode_video will dynamically set the packing quantity based on the duration.
|
407 |
frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)
|
408 |
|
409 |
question = "Describe the video"
|
|
|
415 |
answer = model.chat(
|
416 |
msgs=msgs,
|
417 |
tokenizer=tokenizer,
|
418 |
+
use_image_id=False,
|
419 |
max_slice_nums=1,
|
420 |
temporal_ids=frame_ts_id_group
|
421 |
)
|