readme update
README.md (changed)
@@ -52,6 +52,7 @@ Average word error rate (WER) over the FLEURS, Mozilla Common Voice and Multilin

The model can be used with the following frameworks:

- [`vllm (recommended)`](https://github.com/vllm-project/vllm): See [here](#vllm-recommended)
- [`Transformers` 🤗](https://github.com/huggingface/transformers): See [here](#transformers-🤗)

**Notes**:

@@ -327,4 +328,285 @@ print(30 * "=" + "BOT 1" + 30 * "=")

print(response.choices[0].message.tool_calls)
print("\n\n")
```
</details>

### Transformers 🤗

Voxtral is supported in Transformers natively!

Install Transformers from source:

```bash
pip install git+https://github.com/huggingface/transformers
```
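
Before running the examples below, it can be worth confirming that the source install actually picked up the Voxtral classes. A minimal sanity check, assuming only that it runs in the same environment you just installed into:

```python
# Minimal sanity check: the Voxtral classes only ship in recent transformers
# builds, so an ImportError here means the source install did not take effect.
import transformers
from transformers import VoxtralForConditionalGeneration, AutoProcessor

print(transformers.__version__)  # the dev version string from the install above
```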

#### Audio Instruct

<details>
<summary>➡️ multi-audio + text instruction</summary>

```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda"
repo_id = "mistralai/Voxtral-Small-24B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/mary_had_lamb.mp3",
            },
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
            },
            {"type": "text", "text": "What sport and what nursery rhyme are referenced?"},
        ],
    }
]

inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

print("\nGenerated response:")
print("=" * 80)
print(decoded_outputs[0])
print("=" * 80)
```
</details>
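
The snippet above streams audio from URLs; the same `path` field should also accept a file on your own disk. A minimal sketch, assuming local paths are resolved the same way as URLs and reusing `processor`, `model`, and `device` from the example above (the file name is hypothetical):

```python
# Hypothetical local file; swap in any audio file you have on disk.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "path": "/path/to/local_recording.mp3"},
            {"type": "text", "text": "Summarize this recording in one sentence."},
        ],
    }
]

inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
print(processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0])
```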

<details>
<summary>➡️ multi-turn</summary>

```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda"
repo_id = "mistralai/Voxtral-Small-24B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
            },
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
            },
            {"type": "text", "text": "Describe briefly what you can hear."},
        ],
    },
    {
        "role": "assistant",
        "content": "The audio begins with the speaker delivering a farewell address in Chicago, reflecting on his eight years as president and expressing gratitude to the American people. The audio then transitions to a weather report, stating that it was 35 degrees in Barcelona the previous day, but the temperature would drop to minus 20 degrees the following day.",
    },
    {
        "role": "user",
        "content": [
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
            },
            {"type": "text", "text": "Ok, now compare this new audio with the previous one."},
        ],
    },
]

inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

print("\nGenerated response:")
print("=" * 80)
print(decoded_outputs[0])
print("=" * 80)
```
</details>

<details>
<summary>➡️ text only</summary>

```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda"
repo_id = "mistralai/Voxtral-Small-24B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Why should AI models be open-sourced?",
            },
        ],
    }
]

inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

print("\nGenerated response:")
print("=" * 80)
print(decoded_outputs[0])
print("=" * 80)
```
</details>

<details>
<summary>➡️ audio only</summary>

```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda"
repo_id = "mistralai/Voxtral-Small-24B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
            },
        ],
    }
]

inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

print("\nGenerated response:")
print("=" * 80)
print(decoded_outputs[0])
print("=" * 80)
```
</details>

<details>
<summary>➡️ batched inference</summary>

```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda"
repo_id = "mistralai/Voxtral-Small-24B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversations = [
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
                },
                {
                    "type": "audio",
                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
                },
                {
                    "type": "text",
                    "text": "Who's speaking in the speech and what city's weather is being discussed?",
                },
            ],
        }
    ],
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
                },
                {"type": "text", "text": "What can you tell me about this audio?"},
            ],
        }
    ],
]

inputs = processor.apply_chat_template(conversations)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

print("\nGenerated responses:")
print("=" * 80)
for decoded_output in decoded_outputs:
    print(decoded_output)
    print("=" * 80)
```
</details>
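
All of the chat examples above call `model.generate` with only `max_new_tokens`, so decoding falls back to the model's default generation settings. If you want sampled responses instead, the standard `generate` sampling arguments work unchanged; the values below are purely illustrative, not recommendations from this card:

```python
# Reuses `inputs` from any of the chat examples above; only decoding settings change.
# The temperature / top_p values are illustrative, not tuned recommendations.
outputs = model.generate(
    **inputs,
    max_new_tokens=500,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
```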

#### Transcription

<details>
<summary>➡️ transcribe</summary>

```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda"
repo_id = "mistralai/Voxtral-Small-24B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

print("\nGenerated responses:")
print("=" * 80)
for decoded_output in decoded_outputs:
    print(decoded_output)
    print("=" * 80)
```
</details>
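
To transcribe several files, a small wrapper that issues one request per file keeps things conservative. This is a sketch reusing `processor`, `model`, `device`, and `repo_id` from the snippet above, and it keeps the `apply_transcrition_request` spelling exactly as shown there; whether the method also accepts a list of audio inputs in a single call is not covered here:

```python
def transcribe(audio_path_or_url: str, language: str = "en") -> str:
    # One transcription request per file, mirroring the single-file example above.
    inputs = processor.apply_transcrition_request(language=language, audio=audio_path_or_url, model_id=repo_id)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=500)
    return processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

for url in [
    "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
    "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
]:
    print(transcribe(url))
    print("=" * 80)
```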