Update: README
Browse files
README.md
CHANGED
|
@@ -7,14 +7,11 @@ language:
|
|
| 7 |
- multilingual
|
| 8 |
tags:
|
| 9 |
- minicpm-v
|
| 10 |
-
- VLM
|
| 11 |
- vision
|
| 12 |
- ocr
|
| 13 |
-
- document parsing
|
| 14 |
- multi-image
|
| 15 |
- video
|
| 16 |
- custom_code
|
| 17 |
-
|
| 18 |
---
|
| 19 |
|
| 20 |
<h1>A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone</h1>
|
|
@@ -289,7 +286,7 @@ for new_text in answer:
|
|
| 289 |
print(new_text, flush=True, end='')
|
| 290 |
|
| 291 |
# Second round chat, pass history context of multi-turn conversation
|
| 292 |
-
msgs.append({"role": "assistant", "content": [
|
| 293 |
msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
|
| 294 |
|
| 295 |
answer = model.chat(
|
|
@@ -406,7 +403,7 @@ def encode_video(video_path, choose_fps=3, force_packing=None):
|
|
| 406 |
|
| 407 |
video_path="video_test.mp4"
|
| 408 |
fps = 5 # fps for video
|
| 409 |
-
force_packing = None # You can set force_packing to ensure that 3D
|
| 410 |
frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)
|
| 411 |
|
| 412 |
question = "Describe the video"
|
|
@@ -418,7 +415,7 @@ msgs = [
|
|
| 418 |
answer = model.chat(
|
| 419 |
msgs=msgs,
|
| 420 |
tokenizer=tokenizer,
|
| 421 |
-
use_image_id=False,
|
| 422 |
max_slice_nums=1,
|
| 423 |
temporal_ids=frame_ts_id_group
|
| 424 |
)
|
|
|
|
| 7 |
- multilingual
|
| 8 |
tags:
|
| 9 |
- minicpm-v
|
|
|
|
| 10 |
- vision
|
| 11 |
- ocr
|
|
|
|
| 12 |
- multi-image
|
| 13 |
- video
|
| 14 |
- custom_code
|
|
|
|
| 15 |
---
|
| 16 |
|
| 17 |
<h1>A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone</h1>
|
|
|
|
| 286 |
print(new_text, flush=True, end='')
|
| 287 |
|
| 288 |
# Second round chat, pass history context of multi-turn conversation
|
| 289 |
+
msgs.append({"role": "assistant", "content": [generated_text]})
|
| 290 |
msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
|
| 291 |
|
| 292 |
answer = model.chat(
|
|
|
|
| 403 |
|
| 404 |
video_path="video_test.mp4"
|
| 405 |
fps = 5 # fps for video
|
| 406 |
+
force_packing = None # Set force_packing to force 3D packing on; if left as None, encode_video chooses the packing number dynamically based on the video duration.
|
| 407 |
frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)
|
| 408 |
|
| 409 |
question = "Describe the video"
|
|
|
|
| 415 |
answer = model.chat(
|
| 416 |
msgs=msgs,
|
| 417 |
tokenizer=tokenizer,
|
| 418 |
+
use_image_id=False,
|
| 419 |
max_slice_nums=1,
|
| 420 |
temporal_ids=frame_ts_id_group
|
| 421 |
)
|