reach-vb (HF Staff) and Matthijs committed
Commit 6b0e6af · 0 Parent(s)

Duplicate from Matthijs/whisper_word_timestamps


Co-authored-by: Mathijs Hollemans <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ttf filter=lfs diff=lfs merge=lfs -text
Lato-Regular.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f43f1c7780d69792278f04b136c934a0298fc66f2e974bac13dd2e53adc52bde
+ size 72312
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Whisper Word-Level Timestamps
+ emoji: 💭⏰
+ colorFrom: yellow
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 3.35.2
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ duplicated_from: Matthijs/whisper_word_timestamps
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,199 @@
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import moviepy.editor as mpy
+ import torch
+
+ from PIL import Image, ImageDraw, ImageFont
+ from transformers import pipeline
+
+
+ max_duration = 60  # seconds
+ fps = 25
+ video_width = 640
+ video_height = 480
+ margin_left = 20
+ margin_right = 20
+ margin_top = 20
+ line_height = 44
+
+ background_image = Image.open("background.png")
+ font = ImageFont.truetype("Lato-Regular.ttf", 40)
+ text_color = (255, 200, 200)
+ highlight_color = (255, 255, 255)
+
+ # checkpoint = "openai/whisper-tiny"
+ # checkpoint = "openai/whisper-base"
+ checkpoint = "openai/whisper-small"
+
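+ # On a GPU, build the pipeline by hand so the model can run in float16;
+ # otherwise fall back to the default float32 pipeline on the CPU.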
+ if torch.cuda.is_available() and torch.cuda.device_count() > 0:
+     from transformers import (
+         AutomaticSpeechRecognitionPipeline,
+         WhisperForConditionalGeneration,
+         WhisperProcessor,
+     )
+     model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to("cuda").half()
+     processor = WhisperProcessor.from_pretrained(checkpoint)
+     pipe = AutomaticSpeechRecognitionPipeline(
+         model=model,
+         tokenizer=processor.tokenizer,
+         feature_extractor=processor.feature_extractor,
+         batch_size=8,
+         torch_dtype=torch.float16,
+         device="cuda:0"
+     )
+ else:
+     pipe = pipeline(model=checkpoint)
+
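+ # alignment_heads lists the [layer, head] cross-attention heads whose attention
+ # weights are used (via dynamic time warping) to derive the word timestamps.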
+ # TODO: no longer need to set these manually once the models have been updated on the Hub
+ # whisper-tiny
+ # pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
+ # whisper-base
+ # pipe.model.generation_config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
+ # whisper-small
+ pipe.model.generation_config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
+
+ chunks = []
+
+ start_chunk = 0
+ last_draws = []
+ last_image = None
+
+
+ def make_frame(t):
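+     # Called by moviepy once per output frame; t is the frame time in seconds
+     # and the return value is the frame as a numpy RGB array.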
+     global chunks, start_chunk, last_draws, last_image
+
+     # TODO in the Henry V example, the word "desires" has an ending timestamp
+     # that's too far into the future, and so the word stays highlighted.
+     # Could fix this by finding the latest word that is active in the chunk
+     # and only highlight that one.
+
+     image = background_image.copy()
+     draw = ImageDraw.Draw(image)
+
+     # for debugging: draw frame time
+     # draw.text((20, 20), str(t), fill=text_color, font=font)
+
+     space_length = draw.textlength(" ", font)
+     x = margin_left
+     y = margin_top
+
+     # Create a list of drawing commands
+     draws = []
+     for i in range(start_chunk, len(chunks)):
+         chunk = chunks[i]
+         chunk_start = chunk["timestamp"][0]
+         chunk_end = chunk["timestamp"][1]
+         if chunk_start > t: break
+         if chunk_end is None: chunk_end = max_duration
+
+         word = chunk["text"]
+         word_length = draw.textlength(word + " ", font) - space_length
+
+         if x + word_length >= video_width - margin_right:
+             x = margin_left
+             y += line_height
+
+             # restart page when end is reached
+             if y >= margin_top + line_height * 7:
+                 start_chunk = i
+                 break
+
+         highlight = (chunk_start <= t < chunk_end)
+         draws.append([x, y, word, word_length, highlight])
+
+         x += word_length + space_length
+
+     # If the drawing commands didn't change, then reuse the last image,
+     # otherwise draw a new image
+     if draws != last_draws:
+         for x, y, word, word_length, highlight in draws:
+             if highlight:
+                 color = highlight_color
+                 draw.rectangle([x, y + line_height, x + word_length, y + line_height + 4], fill=color)
+             else:
+                 color = text_color
+
+             draw.text((x, y), word, fill=color, font=font)
+
+         last_image = np.array(image)
+         last_draws = draws
+
+     return last_image
+
+
+ def predict(audio_path):
+     global chunks, start_chunk, last_draws, last_image
+
+     start_chunk = 0
+     last_draws = []
+     last_image = None
+
+     audio_data, sr = librosa.load(audio_path, mono=True)
+     duration = librosa.get_duration(y=audio_data, sr=sr)
+     duration = min(max_duration, duration)
+     audio_data = audio_data[:int(duration * sr)]
+
+     # Run Whisper to get word-level timestamps.
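+     # Whisper's feature extractor expects 16 kHz mono input, hence the resample.
+     # chunk_length_s/stride_length_s run the model over overlapping 30-second
+     # windows so audio longer than Whisper's context window is handled.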
+     audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
+     output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word")
+     chunks = output["chunks"]
+     # print(chunks)
+
+     # Create the video.
+     clip = mpy.VideoClip(make_frame, duration=duration)
+     audio_clip = mpy.AudioFileClip(audio_path).set_duration(duration)
+     clip = clip.set_audio(audio_clip)
+     clip.write_videofile("my_video.mp4", fps=fps, codec="libx264", audio_codec="aac")
+     return "my_video.mp4"
+
+
+ title = "Word-level timestamps with Whisper"
+
+ description = """
+ This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!
+
+ This demo uses the <b>openai/whisper-small</b> checkpoint.
+
+ Since it's only a demo, the output is limited to the first 60 seconds of audio.
+ To use this on longer audio, <a href="https://huggingface.co/spaces/Matthijs/whisper_word_timestamps/settings?duplicate=true">duplicate the space</a>
+ and in <b>app.py</b> change the value of `max_duration`.
+ """
+
+ article = """
+ <div style='margin:20px auto;'>
+
+ <p>Credits:</p>
+
+ <ul>
+ <li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
+ <li>"Here's to the Crazy Ones" speech by Steve Jobs</li>
+ <li>"Stupid People" comedy routine by Bill Engvall</li>
+ <li>"BeOS, It's The OS" song by The Cotton Squares</li>
+ <li>Lato font by Łukasz Dziedzic (licensed under Open Font License)</li>
+ <li>Whisper model by OpenAI</li>
+ </ul>
+
+ </div>
+ """
+
+ examples = [
+     "examples/steve_jobs_crazy_ones.mp3",
+     "examples/henry5.wav",
+     "examples/stupid_people.mp3",
+     "examples/beos_song.mp3",
+ ]
+
+ gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Audio(label="Upload Audio", source="upload", type="filepath"),
+     ],
+     outputs=[
+         gr.Video(label="Output Video"),
+     ],
+     title=title,
+     description=description,
+     article=article,
+     examples=examples,
+ ).launch()
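For context, the `return_timestamps="word"` output that `make_frame` consumes can be inspected directly. A minimal sketch using the same transformers pipeline (the audio file is one of the examples added in this commit; the printed timestamps are illustrative):

    from transformers import pipeline

    pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")
    output = pipe("examples/henry5.wav", chunk_length_s=30, return_timestamps="word")

    # output["chunks"] is a list of {"text": ..., "timestamp": (start, end)} dicts;
    # make_frame() reads exactly these two fields to decide what to highlight.
    for chunk in output["chunks"]:
        start, end = chunk["timestamp"]  # end can be None for the final word
        print(start, end, chunk["text"])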
background.png ADDED
examples/beos_song.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a9a0df5dba8bfd3f4dcc895d98f03552ac4220e7fb30267c20448d33684410b
+ size 1245689
examples/henry5.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4bc2a163bd390650377e63c86c38d2826947f523429bb7e3ad91a6cba8b61309
+ size 6721664
examples/steve_jobs_crazy_ones.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:adc4eb3004ef2878dd694a893ec9cd0c1e2ccc749f8a47aaf4d7fdbdad33cb42
+ size 1467173
examples/stupid_people.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b91f0d180d32a75bc911a22b9331c60b70200087df16e6a422d949e669606ec8
+ size 498736
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ git+https://github.com/huggingface/transformers.git
+ torch
+ torchaudio
+ soundfile
+ librosa
+ moviepy
+ matplotlib
+ pillow