aishams committed on
Commit
0122525
·
1 Parent(s): 70ba2f1

Upload 6 files

Browse files
modules/__init__.py ADDED
File without changes
modules/attentions.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ import modules.commons as commons
9
+ import modules.modules as modules
10
+ from modules.modules import LayerNorm
11
+
12
+
13
class FFT(nn.Module):
    """Stack of causally-masked self-attention and causal feed-forward layers.

    Every layer applies self-attention restricted by a lower-triangular mask,
    then a causal ``FFN``; each sub-layer is followed by dropout, a residual
    connection and ``LayerNorm``.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0.,
                 proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        # Sub-modules are created layer by layer so parameter initialisation
        # consumes the RNG in a fixed order.
        for _ in range(self.n_layers):
            attn = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads,
                p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)
            self.self_attn_layers.append(attn)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            ffn = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size,
                      p_dropout=p_dropout, causal=True)
            self.ffn_layers.append(ffn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run the layer stack.

        x: input sequence (presumably [b, hidden_channels, t] -- confirm with callers)
        x_mask: mask multiplied onto ``x``; its dimension 2 gives the sequence length
        """
        seq_len = x_mask.size(2)
        causal_mask = commons.subsequent_mask(seq_len).to(device=x.device, dtype=x.dtype)
        x = x * x_mask
        layers = zip(self.self_attn_layers, self.norm_layers_0, self.ffn_layers, self.norm_layers_1)
        for attn, norm_attn, ffn, norm_ffn in layers:
            x = norm_attn(x + self.drop(attn(x, x, causal_mask)))
            x = norm_ffn(x + self.drop(ffn(x, x_mask)))
        return x * x_mask
57
+
58
+
59
class Encoder(nn.Module):
    """Stack of windowed relative-position self-attention and feed-forward layers.

    Each layer is: attention -> dropout -> residual + LayerNorm, then
    FFN -> dropout -> residual + LayerNorm.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
                 window_size=4, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            attn = MultiHeadAttention(hidden_channels, hidden_channels, n_heads,
                                      p_dropout=p_dropout, window_size=window_size)
            self.attn_layers.append(attn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            ffn = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)
            self.ffn_layers.append(ffn)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Encode ``x``; positions where ``x_mask`` is zero are zeroed in the output."""
        # Outer product of the mask with itself: a pair of positions may attend
        # to each other only when both are unmasked.
        pair_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2)
        for attn, norm_attn, ffn, norm_ffn in layers:
            x = norm_attn(x + self.drop(attn(x, x, pair_mask)))
            x = norm_ffn(x + self.drop(ffn(x, x_mask)))
        return x * x_mask
94
+
95
+
96
class Decoder(nn.Module):
    """Stack of causal self-attention, encoder-decoder attention and causal FFN layers.

    Each of the three sub-layers is followed by dropout, a residual connection
    and ``LayerNorm``.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
                 proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            self_attn = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads,
                p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)
            self.self_attn_layers.append(self_attn)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            cross_attn = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)
            self.encdec_attn_layers.append(cross_attn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            ffn = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size,
                      p_dropout=p_dropout, causal=True)
            self.ffn_layers.append(ffn)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        x_mask, h_mask: masks for ``x`` and ``h`` respectively
        """
        causal_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        # A decoder position may attend to an encoder position only when both are unmasked.
        cross_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(self.self_attn_layers, self.norm_layers_0,
                     self.encdec_attn_layers, self.norm_layers_1,
                     self.ffn_layers, self.norm_layers_2)
        for self_attn, norm_self, cross_attn, norm_cross, ffn, norm_ffn in layers:
            x = norm_self(x + self.drop(self_attn(x, x, causal_mask)))
            x = norm_cross(x + self.drop(cross_attn(x, h, cross_mask)))
            x = norm_ffn(x + self.drop(ffn(x, x_mask)))
        return x * x_mask
145
+
146
+
147
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention on channel-first ``[b, d, t]`` tensors.

    Optional extras, each enabled by a constructor argument:
      * ``window_size``: learned relative-position embeddings for keys and values
        (self-attention only);
      * ``proximal_bias``: additive ``-log(1 + |i - j|)`` bias on the scores
        (self-attention only);
      * ``block_length``: banded mask limiting attention to nearby positions
        (self-attention only);
      * ``proximal_init``: start the key projection as a copy of the query projection.

    The attention weights of the latest forward pass are kept in ``self.attn``.
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
                 block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads  # channels per head
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding per relative offset in [-window_size, window_size],
            # shared across heads when ``heads_share`` is set.
            rel_heads = 1 if heads_share else n_heads
            rel_shape = (rel_heads, window_size * 2 + 1, self.k_channels)
            rel_std = self.k_channels ** -0.5
            self.emb_rel_k = nn.Parameter(torch.randn(*rel_shape) * rel_std)
            self.emb_rel_v = nn.Parameter(torch.randn(*rel_shape) * rel_std)

        for conv in (self.conv_q, self.conv_k, self.conv_v):
            nn.init.xavier_uniform_(conv.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values) and project the result."""
        q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
        attended, self.attn = self.attention(q, k, v, mask=attn_mask)
        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Return ``(output [b, d, t_t], attention weights [b, n_h, t_t, t_s])``."""
        b, d, t_s = key.size()
        t_t = query.size(2)

        def split_heads(tensor, length):
            # [b, d, t] -> [b, n_h, t, d_k]
            return tensor.view(b, self.n_heads, self.k_channels, length).transpose(2, 3)

        query = split_heads(query, t_t)
        key = split_heads(key, t_s)
        value = split_heads(value, t_s)

        use_relative = self.window_size is not None
        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))
        if use_relative:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            rel_keys = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(scaled_query, rel_keys)
            scores = scores + self._relative_position_to_absolute_position(rel_logits)
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            # Large negative fill (not -inf) on disallowed positions.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                band = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(band == 0, -1e4)

        weights = self.drop(F.softmax(scores, dim=-1))  # [b, n_h, t_t, t_s]
        output = torch.matmul(weights, value)
        if use_relative:
            rel_weights = self._absolute_position_to_relative_position(weights)
            rel_values = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(rel_weights, rel_values)
        # [b, n_h, t_t, d_k] -> [b, d, t_t]
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, weights

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Pad and/or slice the embedding table to ``2 * length - 1`` relative offsets."""
        window = self.window_size
        # Pad first, then slice, so no conditional ops are needed on the slice.
        pad_length = max(length - (window + 1), 0)
        start = max((window + 1) - length, 0)
        stop = start + 2 * length - 1
        if pad_length > 0:
            relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        return relative_embeddings[:, start:stop]

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Append one padding column to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Flatten and pad so the data reshapes to (length + 1, 2 * length - 1).
        flat = x.view([batch, heads, length * 2 * length])
        flat = F.pad(flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        skewed = flat.view([batch, heads, length + 1, 2 * length - 1])
        return skewed[:, :, :length, length - 1:]

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # Pad along the column axis.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        flat = x.view([batch, heads, length * (2 * length - 1)])
        # Leading zeros skew the elements after the reshape below.
        flat = F.pad(flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        return flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        positions = torch.arange(length, dtype=torch.float32)
        distance = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1))
        return (-torch.log1p(distance)).unsqueeze(0).unsqueeze(0)
301
+
302
+
303
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward block with masking.

    The input is masked and padded before each convolution. Padding is
    left-only when ``causal`` is true and split between both sides otherwise.
    ``activation="gelu"`` selects the sigmoid approximation
    ``x * sigmoid(1.702 * x)``; any other value selects ReLU.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
                 causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking before each conv and at the end."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        out = self.conv_2(self.padding(hidden * x_mask))
        return out * x_mask

    def _causal_padding(self, x):
        """Left-pad the last axis by ``kernel_size - 1`` (no-op for kernel size 1)."""
        if self.kernel_size == 1:
            return x
        pad_spec = [[0, 0], [0, 0], [self.kernel_size - 1, 0]]
        return F.pad(x, commons.convert_pad_shape(pad_spec))

    def _same_padding(self, x):
        """Pad the last axis on both sides so the conv keeps the length (no-op for kernel size 1)."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [left, right]]))
modules/commons.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
def slice_pitch_segments(x, ids_str, segment_size=4):
    """Cut one fixed-length window per batch row out of a 2-D tensor.

    For row ``i`` the window is ``x[i, ids_str[i]:ids_str[i] + segment_size]``.
    The result has the shape, dtype and device of ``x[:, :segment_size]``.
    """
    out = torch.zeros_like(x[:, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        out[row] = x[row, start:start + segment_size]
    return out
14
+
15
def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
    """Randomly crop the same window from ``x`` ([b, d, t]) and from ``pitch``.

    Start indices are drawn uniformly from ``[0, x_lengths - segment_size + 1)``;
    ``x_lengths`` defaults to the full time length of ``x``.

    Returns ``(x_segments, pitch_segments, start_indices)``.
    """
    batch, _, total = x.size()
    lengths = total if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(dtype=torch.long)
    x_segments = slice_segments(x, starts, segment_size)
    pitch_segments = slice_pitch_segments(pitch, starts, segment_size)
    return x_segments, pitch_segments, starts
24
+
25
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise ``m.weight`` from N(mean, std) when the class name of ``m`` contains "Conv".

    Modules of any other class are left untouched.
    """
    if "Conv" in type(m).__name__:
        m.weight.data.normal_(mean, std)
29
+
30
+
31
def get_padding(kernel_size, dilation=1):
    """Return ``int(dilation * (kernel_size - 1) / 2)``, the per-side padding for a dilated conv."""
    span = dilation * (kernel_size - 1)
    return int(span / 2)
33
+
34
+
35
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs into the flat list ``F.pad`` expects.

    ``pad_shape`` lists the pairs from the first dimension to the last; the
    result lists them from the last dimension to the first.
    """
    return [amount for pair in reversed(pad_shape) for amount in pair]
39
+
40
+
41
def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements of ``lst``.

    The result has ``2 * len(lst) + 1`` entries; an empty ``lst`` gives ``[item]``.
    """
    result = [item]
    for element in lst:
        result.append(element)
        result.append(item)
    return result
45
+
46
+
47
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q), elementwise, for Gaussians given by means and log standard deviations."""
    log_std_ratio = logs_q - logs_p
    var_p = torch.exp(2. * logs_p)
    sq_mean_gap = (m_p - m_q) ** 2
    inv_var_q = torch.exp(-2. * logs_q)
    return (log_std_ratio - 0.5) + 0.5 * (var_p + sq_mean_gap) * inv_var_q
52
+
53
+
54
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Uniform samples are squeezed into [0.00001, 0.99999) so neither log sees 0.
    u = torch.rand(shape) * 0.99998 + 0.00001
    inner = -torch.log(u)
    return -torch.log(inner)
58
+
59
+
60
def rand_gumbel_like(x):
    """Return Gumbel noise with the shape, dtype and device of ``x``."""
    noise = rand_gumbel(x.size())
    return noise.to(dtype=x.dtype, device=x.device)
63
+
64
+
65
def slice_segments(x, ids_str, segment_size=4):
    """Cut one fixed-length window along the last axis of a 3-D tensor, per batch row.

    For row ``i`` the window is ``x[i, :, ids_str[i]:ids_str[i] + segment_size]``.
    """
    out = torch.zeros_like(x[:, :, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        out[row] = x[row, :, start:start + segment_size]
    return out
72
+
73
+
74
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Randomly crop a ``segment_size`` window from each row of ``x`` ([b, d, t]).

    Start indices are drawn uniformly from ``[0, x_lengths - segment_size + 1)``;
    ``x_lengths`` defaults to the full time length of ``x``.

    Returns ``(segments, start_indices)``.
    """
    batch, _, total = x.size()
    lengths = total if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(dtype=torch.long)
    return slice_segments(x, starts, segment_size), starts
82
+
83
+
84
def rand_spec_segments(x, x_lengths=None, segment_size=4):
    """Randomly crop a ``segment_size`` window from each row of ``x`` ([b, d, t]).

    Start indices are drawn uniformly from ``[0, x_lengths - segment_size)``.
    Unlike ``rand_slice_segments`` there is no ``+ 1``, so the last possible
    start position is never chosen.

    Returns ``(segments, start_indices)``.
    """
    batch, _, total = x.size()
    lengths = total if x_lengths is None else x_lengths
    max_start = lengths - segment_size
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(dtype=torch.long)
    return slice_segments(x, starts, segment_size), starts
92
+
93
+
94
def get_timing_signal_1d(
        length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines of the position scaled by geometrically spaced timescales
    between ``min_timescale`` and ``max_timescale``. When ``channels`` is odd,
    one zero row is appended.

    Args:
      length: number of positions.
      channels: number of output channels.
      min_timescale: smallest timescale.
      max_timescale: largest timescale.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # With a single timescale (channels 2 or 3) ``num_timescales - 1`` is zero;
    # clamp the divisor so those sizes work instead of raising ZeroDivisionError.
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        max(num_timescales - 1, 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Append a zero row when ``channels`` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
108
+
109
+
110
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the sinusoidal timing signal to ``x`` ([b, channels, length]), broadcast over the batch."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
114
+
115
+
116
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the sinusoidal timing signal onto ``x`` ([b, channels, length]) along ``axis``.

    The timing signal is generated with batch size 1. ``torch.cat`` needs all
    non-concatenated dimensions to match, so for any ``axis`` other than the
    batch axis the signal is first expanded to the batch size of ``x``.
    Without that, the call fails whenever ``b > 1``. Concatenation along the
    batch axis is left as it was (one extra row).
    """
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    signal = signal.to(dtype=x.dtype, device=x.device)
    if axis % x.dim() != 0:
        # expand() returns a view, so no data is copied.
        signal = signal.expand(b, -1, -1)
    return torch.cat([x, signal], axis)
120
+
121
+
122
def subsequent_mask(length):
    """Return a ``[1, 1, length, length]`` lower-triangular mask of ones (diagonal included)."""
    lower = torch.ones(length, length).tril()
    return lower[None, None]
125
+
126
+
127
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: tanh of the first channel group times sigmoid of the rest.

    ``input_a + input_b`` is split along dimension 1 at ``n_channels[0]``.
    """
    split = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split, :])
    sigmoid_part = torch.sigmoid(summed[:, split:, :])
    return tanh_part * sigmoid_part
135
+
136
+
137
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs, last dimension first, for ``F.pad``.

    NOTE(review): this module defines ``convert_pad_shape`` twice with the same
    logic; this later definition is the one bound after import.
    """
    flat = []
    for pair in pad_shape[::-1]:
        flat.extend(pair)
    return flat
141
+
142
+
143
def shift_1d(x):
    """Shift ``x`` right by one step along its last axis, zero-filling the first position.

    The final element of the last axis is dropped, so the shape is unchanged.
    """
    left_pad = convert_pad_shape([[0, 0], [0, 0], [1, 0]])
    padded = F.pad(x, left_pad)
    return padded[:, :, :-1]
146
+
147
+
148
def sequence_mask(length, max_length=None):
    """Return a boolean ``[len(length), max_length]`` mask, true where position < length.

    ``max_length`` defaults to the largest value in ``length``.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]
153
+
154
+
155
def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns a path of the same shape as ``mask``. For each ``t_x`` position it
    marks the ``t_y`` span between the previous cumulative duration and its own.
    """
    b, _, t_y, t_x = mask.shape
    cum_duration_flat = torch.cumsum(duration, -1).view(b * t_x)

    # Row k is true on [0, cum_duration[k]).
    filled = sequence_mask(cum_duration_flat, t_y).to(mask.dtype).view(b, t_x, t_y)
    # Subtracting the previous row (shifted along t_x) leaves only each row's own span.
    previous = F.pad(filled, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = filled - previous
    return path.unsqueeze(1).transpose(2, 3) * mask
171
+
172
+
173
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients in place to ``[-clip_value, clip_value]`` and return the pre-clip norm.

    Args:
      parameters: a tensor or an iterable of tensors; those without ``.grad`` are ignored.
      clip_value: clamp bound, or ``None`` to only measure the norm.
      norm_type: order of the norm.

    Returns:
      The total ``norm_type``-norm of all gradients, with each parameter's norm
      taken before that parameter is clamped.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    bound = None if clip_value is None else float(clip_value)

    accumulated = 0
    for p in with_grad:
        grad = p.grad.data
        accumulated += grad.norm(norm_type).item() ** norm_type
        if bound is not None:
            grad.clamp_(min=-bound, max=bound)
    return accumulated ** (1. / norm_type)
modules/losses.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import modules.commons as commons
5
+
6
+
7
def feature_loss(fmap_r, fmap_g):
    """Feature-matching loss: twice the summed mean absolute difference of paired feature maps.

    ``fmap_r`` and ``fmap_g`` are parallel nested sequences of tensors. The
    ``fmap_r`` tensors are detached, so gradients flow only through ``fmap_g``.
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real, gen in zip(real_maps, gen_maps):
            gap = real.float().detach() - gen.float()
            total = total + gap.abs().mean()
    return total * 2
16
+
17
+
18
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss.

    For each paired output, the first term is ``mean((1 - dr)**2)`` and the
    second is ``mean(dg**2)``.

    Returns:
      ``(loss, r_losses, g_losses)``: the summed loss, and the per-pair terms
      as Python floats.
    """
    total = 0
    real_terms = []
    gen_terms = []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = ((1 - real_out.float()) ** 2).mean()
        gen_term = (gen_out.float() ** 2).mean()
        total = total + (real_term + gen_term)
        real_terms.append(real_term.item())
        gen_terms.append(gen_term.item())
    return total, real_terms, gen_terms
32
+
33
+
34
def generator_loss(disc_outputs):
    """Least-squares generator loss: sum over outputs of ``mean((1 - dg)**2)``.

    Returns:
      ``(loss, gen_losses)`` where ``gen_losses`` holds the per-output terms as tensors.
    """
    total = 0
    per_output = []
    for out in disc_outputs:
        term = ((1 - out.float()) ** 2).mean()
        per_output.append(term)
        total = total + term
    return total, per_output
44
+
45
+
46
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """
    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]

    Masked mean of ``logs_p - logs_q - 0.5 + 0.5 * (z_p - m_p)**2 * exp(-2 * logs_p)``.
    All inputs are cast to float32 first.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (
        t.float() for t in (z_p, logs_q, m_p, logs_p, z_mask))
    log_term = logs_p - logs_q - 0.5
    sq_term = 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2. * logs_p)
    masked_total = torch.sum((log_term + sq_term) * z_mask)
    return masked_total / torch.sum(z_mask)
modules/mel_processing.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import random
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ import torch.utils.data
8
+ import numpy as np
9
+ import librosa
10
+ import librosa.util as librosa_util
11
+ from librosa.util import normalize, pad_center, tiny
12
+ from scipy.signal import get_window
13
+ from scipy.io.wavfile import read
14
+ from librosa.filters import mel as librosa_mel_fn
15
+
16
+ MAX_WAV_VALUE = 32768.0
17
+
18
+
19
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    Log-compress ``x`` after clamping it from below.

    PARAMS
    ------
    C: compression factor
    clip_val: lower clamp applied to ``x`` before the log
    """
    floored = x.clamp(min=clip_val)
    return torch.log(floored * C)
26
+
27
+
28
def dynamic_range_decompression_torch(x, C=1):
    """
    Invert the log compression: ``exp(x) / C``.

    PARAMS
    ------
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C
35
+
36
+
37
def spectral_normalize_torch(magnitudes):
    """Log-compress spectrogram magnitudes with the default compression settings."""
    return dynamic_range_compression_torch(magnitudes)
40
+
41
+
42
def spectral_de_normalize_torch(magnitudes):
    """Undo the default log compression of spectrogram magnitudes."""
    return dynamic_range_decompression_torch(magnitudes)
45
+
46
+
47
+ mel_basis = {}
48
+ hann_window = {}
49
+
50
+
51
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram of ``y`` with a cached Hann window.

    ``y`` is reflect-padded by ``(n_fft - hop_size) / 2`` samples on each side
    before the STFT. A message is printed when samples fall outside [-1, 1].
    ``sampling_rate`` is accepted for signature symmetry but not used here.

    Returns ``sqrt(re**2 + im**2 + 1e-6)`` of the one-sided STFT.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    # One window per (win_size, dtype, device) combination.
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # ``return_complex=False`` is deprecated in torch.stft; request a complex
    # result and view it as (..., 2) real/imag pairs, which is the same layout
    # the deprecated mode produced.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
71
+
72
+
73
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a linear spectrogram onto a mel filterbank and log-compress it.

    The filterbank is cached per parameter set. The cache key covers every
    argument that shapes the filterbank (sampling rate, FFT size, number of
    mels, fmin, fmax) plus dtype and device. Keying on ``fmax`` alone would
    return a stale basis when the same ``fmax`` is used with a different
    ``n_fft`` or ``num_mels``.
    """
    global mel_basis
    dtype_device = str(spec.dtype) + '_' + str(spec.device)
    basis_key = '_'.join(str(part) for part in (sampling_rate, n_fft, num_mels, fmin, fmax, dtype_device))
    if basis_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)
    return spec
83
+
84
+
85
def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of ``y``.

    Steps: reflect-pad ``y`` by ``(n_fft - hop_size) / 2`` per side, take the
    STFT with a cached Hann window, compute ``sqrt(re**2 + im**2 + 1e-6)``,
    project onto a cached mel filterbank, then log-compress. A message is
    printed when samples fall outside [-1, 1].
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    # The filterbank key covers every argument that shapes the basis. Keying
    # on ``fmax`` alone would reuse a stale basis for a different
    # ``n_fft``/``num_mels``.
    basis_key = '_'.join(str(part) for part in (sampling_rate, n_fft, num_mels, fmin, fmax, dtype_device))
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if basis_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[basis_key] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # ``return_complex=False`` is deprecated in torch.stft; request a complex
    # result and view it as (..., 2) real/imag pairs, which is the same layout
    # the deprecated mode produced.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
modules/modules.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm
11
+
12
+ import modules.commons as commons
13
+ from modules.commons import init_weights, get_padding
14
+
15
+
16
+ LRELU_SLOPE = 0.1
17
+
18
+
19
class LayerNorm(nn.Module):
    """Layer normalisation over dimension 1 (the channel axis) of a channel-first tensor.

    ``gamma`` (scale, initialised to ones) and ``beta`` (shift, initialised to
    zeros) are learned per channel.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        # F.layer_norm normalises trailing dims, so swap channels to the end and back.
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(channels_last, (self.channels,), self.gamma, self.beta, self.eps)
        return normed.transpose(1, -1)
32
+
33
+
34
class ConvReluNorm(nn.Module):
    """Stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers with a residual 1x1 projection.

    The projection is zero-initialised, so at construction time ``forward``
    returns ``x * x_mask``. The residual add ``x_org + proj(x)`` means
    ``out_channels`` has to be compatible with ``in_channels``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message used to say "larger than 0", contradicting the check.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        # A single ReLU+Dropout module is shared by every layer (it holds no parameters).
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers-1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero init: the block starts as an identity mapping (up to masking).
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Return ``(x + proj(stack(x))) * x_mask``; the input is masked before every convolution."""
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
67
+
68
+
69
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution

    Layer ``i`` is a depthwise conv with dilation ``kernel_size ** i``, then
    LayerNorm, GELU, a 1x1 conv, LayerNorm, GELU and dropout, added back to
    the input as a residual.
    """
    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            dilation = kernel_size ** layer_idx
            padding = (kernel_size * dilation - dilation) // 2
            depthwise = nn.Conv1d(channels, channels, kernel_size,
                                  groups=channels, dilation=dilation, padding=padding)
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Apply the residual stack to ``x``; ``g``, when given, is added to ``x`` first."""
        if g is not None:
            x = x + g
        stages = zip(self.convs_sep, self.norms_1, self.convs_1x1, self.norms_2)
        for depthwise, norm_a, pointwise, norm_b in stages:
            y = F.gelu(norm_a(depthwise(x * x_mask)))
            y = F.gelu(norm_b(pointwise(y)))
            x = x + self.drop(y)
        return x * x_mask
108
+
109
+
110
class WN(torch.nn.Module):
    """Stack of weight-normalised, dilated 1-D convolutions with gated units.

    Layer ``i`` consists of:

    * ``in_layers[i]``: Conv1d ``hidden_channels -> 2 * hidden_channels`` with
      dilation ``dilation_rate ** i``;
    * ``res_skip_layers[i]``: 1x1 Conv1d producing ``2 * hidden_channels``
      (a residual half and a skip half), except for the last layer, which
      produces only the ``hidden_channels`` skip output.

    When ``gin_channels != 0`` a single 1x1 ``cond_layer`` maps the
    conditioning input to ``2 * hidden_channels * n_layers`` channels, which
    ``forward`` slices per layer.

    Args:
        hidden_channels: channel count of the input and the output.
        kernel_size: kernel width of the dilated convolutions; must be odd.
        dilation_rate: base of the per-layer dilation.
        n_layers: number of layers.
        gin_channels: channel count of the conditioning input ``g``;
            ``0`` means no conditioning layer is created.
        p_dropout: dropout probability applied to the gated activations.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        # The padding below, dilation * (kernel_size - 1) / 2, is only an
        # exact integer for every dilation when the kernel width is odd.
        assert kernel_size % 2 == 1, "kernel_size must be odd"
        self.hidden_channels = hidden_channels
        # Stored as a plain int (a stray trailing comma previously turned
        # this attribute into a 1-tuple).
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One conv yields the conditioning for all layers at once.
            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = (kernel_size * dilation - dilation) // 2
            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # The last layer has no successor, so it needs no residual half.
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """Run all layers and return the masked sum of their skip outputs.

        Args:
            x: input tensor with ``hidden_channels`` channels on dim 1.
            x_mask: tensor multiplied onto the residual stream after each
                layer and onto the final output.
            g: optional conditioning tensor. It is passed through
                ``cond_layer``, which only exists when the module was built
                with ``gin_channels != 0``.
            **kwargs: accepted and ignored.

        Returns:
            Sum of the per-layer skip outputs multiplied by ``x_mask``.
        """
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                # Slice out this layer's 2 * hidden_channels of conditioning.
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            # NOTE(review): presumably the tanh/sigmoid gate of (x_in + g_l);
            # confirm against modules.commons.
            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                # First half feeds the residual stream, second half the skip sum.
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrisation from every convolution."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)
184
+
185
+
186
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, undilated conv) pairs.

    ``convs1`` holds three weight-normalised Conv1d layers using
    ``dilation[0]``, ``dilation[1]`` and ``dilation[2]``; ``convs2`` holds
    three weight-normalised Conv1d layers with dilation 1. All of them map
    ``channels -> channels`` with stride 1.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()

        def make_conv(rate):
            # Weight-normalised conv with padding supplied by get_padding.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                      padding=get_padding(kernel_size, rate)))

        self.convs1 = nn.ModuleList([make_conv(dilation[idx]) for idx in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three conv pairs residually.

        Args:
            x: input tensor for the Conv1d layers.
            x_mask: optional tensor; when given it is multiplied onto the
                activations before each convolution and onto the result.

        Returns:
            The residual output, multiplied by ``x_mask`` when one is given.
        """
        use_mask = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                branch = branch * x_mask
            branch = F.leaky_relu(dilated_conv(branch), LRELU_SLOPE)
            if use_mask:
                branch = branch * x_mask
            x = plain_conv(branch) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrisation from every convolution."""
        for conv in [*self.convs1, *self.convs2]:
            remove_weight_norm(conv)
229
+
230
+
231
class ResBlock2(torch.nn.Module):
    """Residual block of two weight-normalised dilated Conv1d layers.

    The two layers use ``dilation[0]`` and ``dilation[1]`` and map
    ``channels -> channels`` with stride 1.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        layers = []
        for idx in range(2):
            rate = dilation[idx]
            layers.append(
                weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                   padding=get_padding(kernel_size, rate))))
        self.convs = nn.ModuleList(layers)
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both convolutions residually.

        Args:
            x: input tensor for the Conv1d layers.
            x_mask: optional tensor; when given it is multiplied onto the
                activations before each convolution and onto the result.

        Returns:
            The residual output, multiplied by ``x_mask`` when one is given.
        """
        use_mask = x_mask is not None
        for conv in self.convs:
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                branch = branch * x_mask
            x = conv(branch) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrisation from every convolution."""
        for conv in self.convs:
            remove_weight_norm(conv)
256
+
257
+
258
class Log(nn.Module):
    """Element-wise log transform with an exp inverse."""

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Apply ``log`` (forward) or ``exp`` (reverse) under a mask.

        Args:
            x: input tensor; the forward path sums over dims 1 and 2.
            x_mask: tensor multiplied onto the result.
            reverse: when true, return only ``exp(x) * x_mask``.
            **kwargs: accepted and ignored.

        Returns:
            Forward: ``(y, logdet)`` with ``y = log(max(x, 1e-5)) * x_mask``
            and ``logdet = sum(-y)`` over dims 1 and 2.
            Reverse: ``exp(x) * x_mask``.
        """
        if reverse:
            return torch.exp(x) * x_mask
        # Clamp first so zero or negative inputs cannot yield -inf or NaN.
        y = torch.clamp_min(x, 1e-5).log() * x_mask
        return y, torch.sum(-y, [1, 2])
267
+
268
+
269
class Flip(nn.Module):
    """Reverse the order of the entries along dim 1."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Flip ``x`` along dim 1.

        Args:
            x: input tensor.
            *args: accepted and ignored.
            reverse: when true, return only the flipped tensor.
            **kwargs: accepted and ignored.

        Returns:
            Forward: ``(flipped, logdet)`` where ``logdet`` is a zero vector
            of length ``x.size(0)`` with the dtype and device of ``x``.
            Reverse: the flipped tensor.
        """
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        zero_logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
        return flipped, zero_logdet
277
+
278
+
279
class ElementwiseAffine(nn.Module):
    """Per-channel affine transform ``y = m + exp(logs) * x`` with inverse.

    ``m`` and ``logs`` are learnable ``(channels, 1)`` parameters, both
    initialised to zero.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Apply the affine map (forward) or its inverse (reverse).

        Args:
            x: input tensor, broadcast against the ``(channels, 1)`` parameters.
            x_mask: tensor multiplied onto the result and onto ``logs`` when
                accumulating the log-determinant.
            reverse: when true, return only the inverted tensor.
            **kwargs: accepted and ignored.

        Returns:
            Forward: ``(y, logdet)`` with ``logdet = sum(logs * x_mask)``
            over dims 1 and 2.
            Reverse: ``(x - m) * exp(-logs) * x_mask``.
        """
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask
        y = (self.m + torch.exp(self.logs) * x) * x_mask
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
295
+
296
+
297
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer whose shift and log-scale come from a WN stack.

    The input is split in half along dim 1. The first half passes through
    unchanged and drives ``pre -> enc (WN) -> post`` to produce a shift ``m``
    and, unless ``mean_only`` is set, a log-scale ``logs`` for the second
    half. ``post`` is zero-initialised, so ``m`` and ``logs`` start at zero.
    """

    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers,
                      p_dropout=p_dropout, gin_channels=gin_channels)
        # One group of half_channels for the shift, plus a second group for
        # the log-scale unless mean_only is set.
        stats_channels = self.half_channels * (2 - mean_only)
        self.post = nn.Conv1d(hidden_channels, stats_channels, 1)
        for param in (self.post.weight, self.post.bias):
            param.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Transform the second half of ``x`` conditioned on the first half.

        Args:
            x: input tensor with ``channels`` entries on dim 1.
            x_mask: tensor multiplied onto intermediate activations and onto
                the transformed half.
            g: optional conditioning tensor forwarded to the WN encoder.
            reverse: when true, apply the inverse map and return only the
                tensor.

        Returns:
            Forward: ``(x, logdet)`` with ``logdet = sum(logs)`` over dims
            1 and 2. Reverse: the inverted tensor.
        """
        split_sizes = [self.half_channels] * 2
        x0, x1 = torch.split(x, split_sizes, 1)
        hidden = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
        stats = self.post(hidden) * x_mask
        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, split_sizes, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        return torch.cat([x0, x1], 1), torch.sum(logs, [1, 2])