Dionyssos committed
Commit ad493ec · 1 Parent(s): 2c8dc43

orig. files

README.md CHANGED
@@ -1,13 +1,18 @@
  ---
- title: Speech Analysis2
- emoji: 🏢
- colorFrom: pink
+ title: Speech analysis
+ emoji:
+ colorFrom: gray
  colorTo: gray
  sdk: gradio
  sdk_version: 5.41.1
  app_file: app.py
  pinned: false
- short_description: Fusion of Age-gender and emotion detectors
+ license: cc-by-nc-4.0
+ tags:
+ - age
+ - gender
+ - expression
+ - audio
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,276 @@
+ import typing
+
+ import gradio as gr
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import spaces
+ import torch
+ import torch.nn as nn
+ from transformers import Wav2Vec2Processor
+ from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
+ from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
+
+ import audiofile
+ import audresample
+
+
+ device = 0 if torch.cuda.is_available() else "cpu"
+ duration = 2  # limit processing to the first 2 seconds of audio
+ age_gender_model_name = "audeering/wav2vec2-large-robust-24-ft-age-gender"
+ expression_model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
+
+
+ class AgeGenderHead(nn.Module):
+     r"""Age-gender model head."""
+
+     def __init__(self, config, num_labels):
+
+         super().__init__()
+
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, num_labels)
+
+     def forward(self, features, **kwargs):
+
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+
+         return x
+
+
+ class AgeGenderModel(Wav2Vec2PreTrainedModel):
+     r"""Age-gender recognition model."""
+
+     def __init__(self, config):
+
+         super().__init__(config)
+
+         self.config = config
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.age = AgeGenderHead(config, 1)
+         self.gender = AgeGenderHead(config, 3)
+         self.init_weights()
+
+     def forward(
+             self,
+             input_values,
+     ):
+
+         outputs = self.wav2vec2(input_values)
+         hidden_states = outputs[0]
+         hidden_states = torch.mean(hidden_states, dim=1)
+         logits_age = self.age(hidden_states)
+         logits_gender = torch.softmax(self.gender(hidden_states), dim=1)
+
+         return hidden_states, logits_age, logits_gender
+
+
+ class ExpressionHead(nn.Module):
+     r"""Expression model head."""
+
+     def __init__(self, config):
+
+         super().__init__()
+
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+
+         return x
+
+
+ class ExpressionModel(Wav2Vec2PreTrainedModel):
+     r"""Speech expression model."""
+
+     def __init__(self, config):
+
+         super().__init__(config)
+
+         self.config = config
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.classifier = ExpressionHead(config)
+         self.init_weights()
+
+     def forward(self, input_values):
+         outputs = self.wav2vec2(input_values)
+         hidden_states = outputs[0]
+         hidden_states = torch.mean(hidden_states, dim=1)
+         logits = self.classifier(hidden_states)
+
+         return hidden_states, logits
+
+
+ # Load models from hub
+ age_gender_processor = Wav2Vec2Processor.from_pretrained(age_gender_model_name)
+ age_gender_model = AgeGenderModel.from_pretrained(age_gender_model_name)
+ expression_processor = Wav2Vec2Processor.from_pretrained(expression_model_name)
+ expression_model = ExpressionModel.from_pretrained(expression_model_name)
+
+
+ def process_func(x: np.ndarray, sampling_rate: int) -> typing.Tuple[str, dict, str]:
+     r"""Predict age, gender, and expression from the raw audio signal."""
+     # run through the processor to normalize the signal;
+     # it always returns a batch, so we take the first entry
+     # and then put it on the device
+     results = []
+     for processor, model in zip(
+         [age_gender_processor, expression_processor],
+         [age_gender_model, expression_model],
+     ):
+         y = processor(x, sampling_rate=sampling_rate)
+         y = y['input_values'][0]
+         y = y.reshape(1, -1)
+         y = torch.from_numpy(y).to(device)
+
+         # run through model
+         with torch.no_grad():
+             y = model(y)
+             if len(y) == 3:
+                 # Age-gender model
+                 y = torch.hstack([y[1], y[2]])
+             else:
+                 # Expression model
+                 y = y[1]
+
+         # convert to numpy
+         y = y.detach().cpu().numpy()
+         results.append(y[0])
+
+     # Plot arousal/dominance/valence (A/D/V) values
+     plot_expression(results[1][0], results[1][1], results[1][2])
+     expression_file = "expression.png"
+     plt.savefig(expression_file)
+     return (
+         f"{round(100 * results[0][0])} years",  # age (the model predicts age / 100)
+         {
+             "female": results[0][1],
+             "male": results[0][2],
+             "child": results[0][3],
+         },
+         expression_file,
+     )
+
+
+ @spaces.GPU
+ def recognize(input_file: str) -> typing.Tuple[str, dict, str]:
+     # sampling_rate, signal = input_microphone
+     # signal = signal.astype(np.float32, order="C") / 32768.0
+     if input_file is None:
+         raise gr.Error(
+             "No audio file submitted! "
+             "Please upload or record an audio file "
+             "before submitting your request."
+         )
+
+     signal, sampling_rate = audiofile.read(input_file, duration=duration)
+     # Resample to the sampling rate supported by the models
+     target_rate = 16000
+     signal = audresample.resample(signal, sampling_rate, target_rate)
+
+     return process_func(signal, target_rate)
+
+
+ def plot_expression(arousal, dominance, valence):
+     r"""3D pixel plot of arousal, dominance, valence."""
+     # Voxels per dimension
+     voxels = 7
+     # Create voxel grid
+     x, y, z = np.indices((voxels + 1, voxels + 1, voxels + 1))
+     voxel = (
+         (x == round(arousal * voxels))
+         & (y == round(dominance * voxels))
+         & (z == round(valence * voxels))
+     )
+     projection = (
+         (x == round(arousal * voxels))
+         & (y == round(dominance * voxels))
+         & (z < round(valence * voxels))
+     )
+     colors = np.empty((voxel | projection).shape, dtype=object)
+     colors[voxel] = "#fcb06c"
+     colors[projection] = "#fed7a9"
+     ax = plt.figure().add_subplot(projection='3d')
+     ax.voxels(voxel | projection, facecolors=colors, edgecolor='k')
+     ax.set_xlim([0, voxels])
+     ax.set_ylim([0, voxels])
+     ax.set_zlim([0, voxels])
+     ax.set_aspect("equal")
+     ax.set_xlabel("arousal", fontsize="large", labelpad=0)
+     ax.set_ylabel("dominance", fontsize="large", labelpad=0)
+     ax.set_zlabel("valence", fontsize="large", labelpad=0)
+     ax.set_xticks(
+         list(range(voxels + 1)),
+         labels=[0, None, None, None, None, None, None, 1],
+         verticalalignment="bottom",
+     )
+     ax.set_yticks(
+         list(range(voxels + 1)),
+         labels=[0, None, None, None, None, None, None, 1],
+         verticalalignment="bottom",
+     )
+     ax.set_zticks(
+         list(range(voxels + 1)),
+         labels=[0, None, None, None, None, None, None, 1],
+         verticalalignment="top",
+     )
+
+
+ description = (
+     "Estimate **age**, **gender**, and **expression** "
+     "of the speaker contained in an audio file or microphone recording. \n"
+     f"The model [{age_gender_model_name}]"
+     f"(https://huggingface.co/{age_gender_model_name}) "
+     "recognises age and gender, "
+     f"whereas [{expression_model_name}]"
+     f"(https://huggingface.co/{expression_model_name}) "
+     "recognises the expression dimensions arousal, dominance, and valence. "
+ )
+
+ with gr.Blocks() as demo:
+     with gr.Tab(label="Speech analysis"):
+         with gr.Row():
+             with gr.Column():
+                 gr.Markdown(description)
+                 input = gr.Audio(
+                     sources=["upload", "microphone"],
+                     type="filepath",
+                     label="Audio input",
+                     min_length=0.025,  # seconds
+                 )
+                 gr.Examples(
+                     [
+                         "female-46-neutral.wav",
+                         "female-20-happy.wav",
+                         "male-60-angry.wav",
+                         "male-27-sad.wav",
+                     ],
+                     [input],
+                     label="Examples from CREMA-D, ODbL v1.0 license",
+                 )
+                 gr.Markdown("Only the first two seconds of the audio will be processed.")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_age = gr.Textbox(label="Age")
+                 output_gender = gr.Label(label="Gender")
+                 output_expression = gr.Image(label="Expression")
+
+         outputs = [output_age, output_gender, output_expression]
+         submit_btn.click(recognize, input, outputs)
+
+
+ demo.launch(debug=True)
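
For reference: `recognize()` above only reads the first `duration` (2) seconds of the submitted file and resamples them to 16 kHz before handing the signal to `process_func()`. A minimal standalone sketch of just that preprocessing step, assuming one of the bundled CREMA-D examples sits in the working directory and the `audiofile`/`audresample` packages from requirements.txt are installed:

```python
import audiofile
import audresample

# Read at most the first two seconds of the example file ...
signal, sampling_rate = audiofile.read("female-46-neutral.wav", duration=2)

# ... and resample to the 16 kHz rate the wav2vec2 models expect.
signal = audresample.resample(signal, sampling_rate, 16000)
print(signal.shape, f"resampled from {sampling_rate} Hz to 16000 Hz")
```
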
female-20-happy.wav ADDED
Binary file (51 kB).
 
female-46-neutral.wav ADDED
Binary file (37.6 kB).
 
male-27-sad.wav ADDED
Binary file (50.4 kB).
 
male-60-angry.wav ADDED
Binary file (60.5 kB).
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ audiofile
+ audresample
+ matplotlib
+ torch
+ transformers
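
Note that `gradio` and `spaces` are imported by app.py but not pinned here; on Hugging Face Spaces they are presumably supplied by the runtime selected via `sdk: gradio` in README.md, so a local run would need them installed separately. A quick import check of the pinned packages (a sketch for local use):

```python
# Sketch: verify the dependencies listed in requirements.txt are importable.
import audiofile
import audresample
import matplotlib
import torch
import transformers

print("torch", torch.__version__, "| transformers", transformers.__version__)
```
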