Update README.md
README.md
CHANGED
@@ -173,9 +173,74 @@ We release text annotations for all embodied reasoning datasets and videos for R
 
 
 ## Inference:
-**Acceleration Engine:** PyTorch, flash attention <br>
 **Test Hardware:** H100, A100, GB200 <br>
-
+```python
+from transformers import AutoProcessor
+from vllm import LLM, SamplingParams
+from qwen_vl_utils import process_vision_info
+
+# You can also replace MODEL_PATH with one of the safetensors folder paths mentioned above
+MODEL_PATH = "nvidia/Cosmos-Reason1-7B"
+
+llm = LLM(
+    model=MODEL_PATH,
+    limit_mm_per_prompt={"image": 10, "video": 10},
+)
+
+sampling_params = SamplingParams(
+    temperature=0.6,
+    top_p=0.95,
+    repetition_penalty=1.05,
+    max_tokens=4096,
+)
+
+video_messages = [
+    {"role": "system", "content": "You are a helpful assistant. Answer the question in the following format: <think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>."},
+    {"role": "user", "content": [
+        {"type": "text", "text": "Is it safe to turn right?"},
+        {
+            "type": "video",
+            "video": "file:///path/to/your/video.mp4",
+            "fps": 4,
+        },
+    ]},
+]
+
+# Here we use video messages as a demonstration
+messages = video_messages
+
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+prompt = processor.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
+
+mm_data = {}
+if image_inputs is not None:
+    mm_data["image"] = image_inputs
+if video_inputs is not None:
+    mm_data["video"] = video_inputs
+
+llm_inputs = {
+    "prompt": prompt,
+    "multi_modal_data": mm_data,
+    # FPS will be returned in video_kwargs
+    "mm_processor_kwargs": video_kwargs,
+}
+
+outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
+generated_text = outputs[0].outputs[0].text
+
+print(generated_text)
+```
+
 
 ## Ethical Considerations
 
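The system prompt in the added snippet asks the model to wrap its reasoning in `<think>` tags and its final answer in `<answer>` tags, so callers typically want to strip the reasoning back out of `generated_text`. A minimal sketch of that step, assuming the model follows the requested format; `extract_answer` is an illustrative helper, not part of this repository:

```python
import re

def extract_answer(generated_text: str) -> str:
    """Return the contents of the <answer> block requested by the system
    prompt, falling back to the raw text if the model ignored the format.
    Illustrative helper, not part of this repository."""
    match = re.search(r"<answer>\s*(.*?)\s*</answer>", generated_text, re.DOTALL)
    return match.group(1) if match else generated_text.strip()

# Example with a response in the requested format:
reply = "<think>\nThe lane is clear and no pedestrians are crossing.\n</think>\n\n<answer>\nYes, it is safe to turn right.\n</answer>"
print(extract_answer(reply))  # -> "Yes, it is safe to turn right."
```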