akhaliq (HF Staff) committed
Commit 15a6c65 · verified · 1 Parent(s): 932bf55

Upload app.py with huggingface_hub
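
For reference, an upload like the one described in this commit message is typically done with huggingface_hub's upload_file API. Below is a minimal sketch, not the exact command used here; the target repo id is a placeholder (this page does not name the repository) and repo_type="space" is an assumption based on the Gradio app being uploaded.

from huggingface_hub import HfApi

api = HfApi()  # authenticates via a cached login or the HF_TOKEN environment variable
api.upload_file(
    path_or_fileobj="app.py",                 # local file to upload
    path_in_repo="app.py",                    # destination path inside the repo
    repo_id="<username>/<space-name>",        # placeholder, not the actual repo id
    repo_type="space",                        # assumption: the app targets a Gradio Space
    commit_message="Upload app.py with huggingface_hub",
)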

Files changed (1)
  1. app.py +67 -0
app.py ADDED
@@ -0,0 +1,67 @@
+ import gradio as gr
+ import torch
+ import tempfile
+ import os
+ from typing import List, Tuple
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(
+     repo_id,
+     torch_dtype=torch.bfloat16,
+     device_map=device,
+ )
+
+ def respond(audio_files: List[str], question: str) -> Tuple[str, List[str]]:
+     if not audio_files:
+         return "Please upload at least one audio file.", []
+
+     conversation = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "audio", "path": path} for path in audio_files
+             ] + [{"type": "text", "text": question}],
+         }
+     ]
+
+     inputs = processor.apply_chat_template(conversation)
+     inputs = inputs.to(device, dtype=torch.bfloat16)
+
+     with torch.no_grad():
+         outputs = model.generate(**inputs, max_new_tokens=500)
+     decoded = processor.batch_decode(
+         outputs[:, inputs.input_ids.shape[1]:],
+         skip_special_tokens=True,
+     )
+     return decoded[0], audio_files
+
+ demo = gr.Interface(
+     fn=respond,
+     inputs=[
+         gr.File(file_count="multiple", file_types=["audio"], type="filepath", label="Audio files"),
+         gr.Textbox(lines=2, placeholder="Ask something about the audio(s)...", label="Question"),
+     ],
+     outputs=[
+         gr.Textbox(label="Answer"),
+         gr.Files(label="Uploaded audio files"),
+     ],
+     title="Voxtral-Mini-3B-2507 Audio Q&A",
+     description="Upload one or more audio files and ask any question about them.",
+     examples=[
+         [
+             [
+                 "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/mary_had_lamb.mp3",
+                 "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
+             ],
+             "What sport and what nursery rhyme are referenced?",
+         ]
+     ],
+     cache_examples=False,
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
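
Once the Space is running, the same gr.Interface can also be queried programmatically. The sketch below is an illustration using gradio_client, not part of the commit: the Space id is a placeholder, "/predict" is the default endpoint name a gr.Interface exposes, and passing the multi-file input as a list of handle_file(...) values is an assumption that follows from the gr.File(file_count="multiple") input above.

from gradio_client import Client, handle_file  # handle_file requires a recent gradio_client release

client = Client("<username>/<space-name>")  # placeholder Space id

# The Interface has two inputs (audio files, question) and two outputs (answer, echoed files).
answer, uploaded = client.predict(
    [
        handle_file("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/mary_had_lamb.mp3"),
        handle_file("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3"),
    ],
    "What sport and what nursery rhyme are referenced?",
    api_name="/predict",
)
print(answer)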