PhoenixStormJr committed on
Commit d1abff7 · verified · 1 parent: 43b3a27

Upload vc_infer_pipeline.py with huggingface_hub

Files changed (1)
  1. vc_infer_pipeline.py +620 -0
vc_infer_pipeline.py ADDED
@@ -0,0 +1,620 @@
+ import numpy as np, parselmouth, torch, pdb
+ from time import time as ttime
+ import torch.nn.functional as F
+ import torchcrepe  # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
+ from torch import Tensor
+ import scipy.signal as signal
+ import pyworld, os, traceback, faiss, librosa
+ from functools import lru_cache
+
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+
+ input_audio_path2wav = {}
+
+ @lru_cache
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
+     audio = input_audio_path2wav[input_audio_path]
+     f0, t = pyworld.harvest(
+         audio,
+         fs=fs,
+         f0_ceil=f0max,
+         f0_floor=f0min,
+         frame_period=frame_period,
+     )
+     f0 = pyworld.stonemask(audio, f0, t, fs)
+     return f0
+
+
+ def change_rms(data1, sr1, data2, sr2, rate):  # data1: input audio, data2: output audio, rate: weight of data2
+     # print(data1.max(), data2.max())
+     rms1 = librosa.feature.rms(
+         y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+     )  # one point every half second
+     rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+     rms1 = torch.from_numpy(rms1)
+     rms1 = F.interpolate(
+         rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.from_numpy(rms2)
+     rms2 = F.interpolate(
+         rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+     data2 *= (
+         torch.pow(rms1, torch.tensor(1 - rate))
+         * torch.pow(rms2, torch.tensor(rate - 1))
+     ).numpy()
+     return data2
+
+
+ class VC(object):
+     def __init__(self, tgt_sr, config):
+         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+             config.x_pad,
+             config.x_query,
+             config.x_center,
+             config.x_max,
+             config.is_half,
+         )
+         self.sr = 16000  # hubert input sample rate
+         self.window = 160  # samples per frame
+         self.t_pad = self.sr * self.x_pad  # padding before and after each segment
+         self.t_pad_tgt = tgt_sr * self.x_pad
+         self.t_pad2 = self.t_pad * 2
+         self.t_query = self.sr * self.x_query  # query window before and after each cut point
+         self.t_center = self.sr * self.x_center  # cut-point query positions
+         self.t_max = self.sr * self.x_max  # duration threshold below which no cut-point query is needed
+         self.device = config.device
+
+     # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
+     def get_optimal_torch_device(self, index: int = 0) -> torch.device:
+         # Get cuda device
+         if torch.cuda.is_available():
+             return torch.device(f"cuda:{index % torch.cuda.device_count()}")  # Very fast
+         elif torch.backends.mps.is_available():
+             return torch.device("mps")
+         # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
+         # Otherwise return "cpu" as the torch device
+         return torch.device("cpu")
+
+     # Fork Feature: Compute f0 with the crepe method
+     def get_f0_crepe_computation(
+         self,
+         x,
+         f0_min,
+         f0_max,
+         p_len,
+         hop_length=160,  # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
+         model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
+     ):
+         x = x.astype(np.float32)  # fixes the F.conv2D exception. We needed to convert double to float.
+         x /= np.quantile(np.abs(x), 0.999)
+         torch_device = self.get_optimal_torch_device()
+         audio = torch.from_numpy(x).to(torch_device, copy=True)
+         audio = torch.unsqueeze(audio, dim=0)
+         if audio.ndim == 2 and audio.shape[0] > 1:
+             audio = torch.mean(audio, dim=0, keepdim=True).detach()
+         audio = audio.detach()
+         print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
+         pitch: Tensor = torchcrepe.predict(
+             audio,
+             self.sr,
+             hop_length,
+             f0_min,
+             f0_max,
+             model,
+             batch_size=hop_length * 2,
+             device=torch_device,
+             pad=True,
+         )
+         p_len = p_len or x.shape[0] // hop_length
+         # Resize the pitch for final f0
+         source = np.array(pitch.squeeze(0).cpu().float().numpy())
+         source[source < 0.001] = np.nan
+         target = np.interp(
+             np.arange(0, len(source) * p_len, len(source)) / p_len,
+             np.arange(0, len(source)),
+             source,
+         )
+         f0 = np.nan_to_num(target)
+         return f0  # Resized f0
+
+     def get_f0_official_crepe_computation(
+         self,
+         x,
+         f0_min,
+         f0_max,
+         model="full",
+     ):
+         # Pick a batch size that doesn't cause memory errors on your gpu
+         batch_size = 512
+         # Compute pitch using first gpu
+         audio = torch.tensor(np.copy(x))[None].float()
+         f0, pd = torchcrepe.predict(
+             audio,
+             self.sr,
+             self.window,
+             f0_min,
+             f0_max,
+             model,
+             batch_size=batch_size,
+             device=self.device,
+             return_periodicity=True,
+         )
+         pd = torchcrepe.filter.median(pd, 3)
+         f0 = torchcrepe.filter.mean(f0, 3)
+         f0[pd < 0.1] = 0
+         f0 = f0[0].cpu().numpy()
+         return f0
+
+     # Fork Feature: Compute pYIN f0 method
+     def get_f0_pyin_computation(self, x, f0_min, f0_max):
+         y = x.astype(np.float32)  # pYIN expects a float waveform
+         f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
+         f0 = f0[1:]  # Get rid of extra first frame
+         return f0
+
+     # Fork Feature: Acquire median hybrid f0 estimation calculation
+     def get_f0_hybrid_computation(
+         self,
+         methods_str,
+         input_audio_path,
+         x,
+         f0_min,
+         f0_max,
+         p_len,
+         filter_radius,
+         crepe_hop_length,
+         time_step,
+     ):
+         # Get various f0 methods from input to use in the computation stack
+         s = methods_str
+         s = s.split("hybrid")[1]
+         s = s.replace("[", "").replace("]", "")
+         methods = s.split("+")
+         f0_computation_stack = []
+
+         print("Calculating f0 pitch estimations for methods: %s" % str(methods))
+         x = x.astype(np.float32)
+         x /= np.quantile(np.abs(x), 0.999)
+         # Get f0 calculations for all methods specified
+         for method in methods:
+             f0 = None
+             if method == "pm":
+                 f0 = (
+                     parselmouth.Sound(x, self.sr)
+                     .to_pitch_ac(
+                         time_step=time_step / 1000,
+                         voicing_threshold=0.6,
+                         pitch_floor=f0_min,
+                         pitch_ceiling=f0_max,
+                     )
+                     .selected_array["frequency"]
+                 )
+                 pad_size = (p_len - len(f0) + 1) // 2
+                 if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                     f0 = np.pad(
+                         f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                     )
+             elif method == "crepe":
+                 f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
+                 f0 = f0[1:]  # Get rid of extra first frame
+             elif method == "crepe-tiny":
+                 f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
+                 f0 = f0[1:]  # Get rid of extra first frame
+             elif method == "mangio-crepe":
+                 f0 = self.get_f0_crepe_computation(
+                     x, f0_min, f0_max, p_len, crepe_hop_length
+                 )
+             elif method == "mangio-crepe-tiny":
+                 f0 = self.get_f0_crepe_computation(
+                     x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+                 )
+             elif method == "harvest":
+                 f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+                 if filter_radius > 2:
+                     f0 = signal.medfilt(f0, 3)
+                 f0 = f0[1:]  # Get rid of first frame.
+             elif method == "dio":  # Potentially buggy?
+                 f0, t = pyworld.dio(
+                     x.astype(np.double),
+                     fs=self.sr,
+                     f0_ceil=f0_max,
+                     f0_floor=f0_min,
+                     frame_period=10,
+                 )
+                 f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+                 f0 = signal.medfilt(f0, 3)
+                 f0 = f0[1:]
+             # elif method == "pyin": Not Working just yet
+             #     f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
+             # Push method to the stack
+             f0_computation_stack.append(f0)
+
+         for fc in f0_computation_stack:
+             print(len(fc))
+
+         print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
+         f0_median_hybrid = None
+         if len(f0_computation_stack) == 1:
+             f0_median_hybrid = f0_computation_stack[0]
+         else:
+             f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
+         return f0_median_hybrid
+
+     def get_f0(
+         self,
+         input_audio_path,
+         x,
+         p_len,
+         f0_up_key,
+         f0_method,
+         filter_radius,
+         crepe_hop_length,
+         inp_f0=None,
+     ):
+         global input_audio_path2wav
+         time_step = self.window / self.sr * 1000
+         f0_min = 50
+         f0_max = 1100
+         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+         if f0_method == "pm":
+             f0 = (
+                 parselmouth.Sound(x, self.sr)
+                 .to_pitch_ac(
+                     time_step=time_step / 1000,
+                     voicing_threshold=0.6,
+                     pitch_floor=f0_min,
+                     pitch_ceiling=f0_max,
+                 )
+                 .selected_array["frequency"]
+             )
+             pad_size = (p_len - len(f0) + 1) // 2
+             if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                 f0 = np.pad(
+                     f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                 )
+         elif f0_method == "harvest":
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+             if filter_radius > 2:
+                 f0 = signal.medfilt(f0, 3)
+         elif f0_method == "dio":  # Potentially Buggy?
+             f0, t = pyworld.dio(
+                 x.astype(np.double),
+                 fs=self.sr,
+                 f0_ceil=f0_max,
+                 f0_floor=f0_min,
+                 frame_period=10,
+             )
+             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+             f0 = signal.medfilt(f0, 3)
+         elif f0_method == "crepe":
+             f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
+         elif f0_method == "crepe-tiny":
+             f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
+         elif f0_method == "mangio-crepe":
+             f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
+         elif f0_method == "mangio-crepe-tiny":
+             f0 = self.get_f0_crepe_computation(
+                 x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+             )
+         elif "hybrid" in f0_method:
+             # Perform hybrid median pitch estimation
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = self.get_f0_hybrid_computation(
+                 f0_method,
+                 input_audio_path,
+                 x,
+                 f0_min,
+                 f0_max,
+                 p_len,
+                 filter_radius,
+                 crepe_hop_length,
+                 time_step,
+             )
+
+         f0 *= pow(2, f0_up_key / 12)
+         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         tf0 = self.sr // self.window  # f0 points per second
+         if inp_f0 is not None:
+             delta_t = np.round(
+                 (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+             ).astype("int16")
+             replace_f0 = np.interp(
+                 list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+             )
+             shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+             f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                 :shape
+             ]
+         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         f0bak = f0.copy()
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+             f0_mel_max - f0_mel_min
+         ) + 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > 255] = 255
+         f0_coarse = np.rint(f0_mel).astype(np.int64)
+
+         return f0_coarse, f0bak  # 1-0
+
+     def vc(
+         self,
+         model,
+         net_g,
+         sid,
+         audio0,
+         pitch,
+         pitchf,
+         times,
+         index,
+         big_npy,
+         index_rate,
+         version,
+         protect,
+     ):  # ,file_index,file_big_npy
+         feats = torch.from_numpy(audio0)
+         if self.is_half:
+             feats = feats.half()
+         else:
+             feats = feats.float()
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+
+         inputs = {
+             "source": feats.to(self.device),
+             "padding_mask": padding_mask,
+             "output_layer": 9 if version == "v1" else 12,
+         }
+         t0 = ttime()
+         with torch.no_grad():
+             logits = model.extract_features(**inputs)
+             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             feats0 = feats.clone()
+         if (
+             index is not None
+             and big_npy is not None
+             and index_rate != 0
+         ):
+             npy = feats[0].cpu().numpy()
+             if self.is_half:
+                 npy = npy.astype("float32")
+
+             # _, I = index.search(npy, 1)
+             # npy = big_npy[I.squeeze()]
+
+             score, ix = index.search(npy, k=8)
+             weight = np.square(1 / score)
+             weight /= weight.sum(axis=1, keepdims=True)
+             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+
+             if self.is_half:
+                 npy = npy.astype("float16")
+             feats = (
+                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                 + (1 - index_rate) * feats
+             )
+
+         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                 0, 2, 1
+             )
+         t1 = ttime()
+         p_len = audio0.shape[0] // self.window
+         if feats.shape[1] < p_len:
+             p_len = feats.shape[1]
+             if pitch is not None and pitchf is not None:
+                 pitch = pitch[:, :p_len]
+                 pitchf = pitchf[:, :p_len]
+
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             pitchff = pitchf.clone()
+             pitchff[pitchf > 0] = 1
+             pitchff[pitchf < 1] = protect
+             pitchff = pitchff.unsqueeze(-1)
+             feats = feats * pitchff + feats0 * (1 - pitchff)
+             feats = feats.to(feats0.dtype)
+         p_len = torch.tensor([p_len], device=self.device).long()
+         with torch.no_grad():
+             if pitch is not None and pitchf is not None:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                 )
+             else:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
+                 )
+         del feats, p_len, padding_mask
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         t2 = ttime()
+         times[0] += t1 - t0
+         times[2] += t2 - t1
+         return audio1
+
+     def pipeline(
+         self,
+         model,
+         net_g,
+         sid,
+         audio,
+         input_audio_path,
+         times,
+         f0_up_key,
+         f0_method,
+         file_index,
+         # file_big_npy,
+         index_rate,
+         if_f0,
+         filter_radius,
+         tgt_sr,
+         resample_sr,
+         rms_mix_rate,
+         version,
+         protect,
+         crepe_hop_length,
+         f0_file=None,
+     ):
+         if (
+             file_index != ""
+             # and file_big_npy != ""
+             # and os.path.exists(file_big_npy) == True
+             and os.path.exists(file_index)
+             and index_rate != 0
+         ):
+             try:
+                 index = faiss.read_index(file_index)
+                 # big_npy = np.load(file_big_npy)
+                 big_npy = index.reconstruct_n(0, index.ntotal)
+             except:
+                 traceback.print_exc()
+                 index = big_npy = None
+         else:
+             index = big_npy = None
+         audio = signal.filtfilt(bh, ah, audio)
+         audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+         opt_ts = []
+         if audio_pad.shape[0] > self.t_max:
+             audio_sum = np.zeros_like(audio)
+             for i in range(self.window):
+                 audio_sum += audio_pad[i : i - self.window]
+             for t in range(self.t_center, audio.shape[0], self.t_center):
+                 opt_ts.append(
+                     t
+                     - self.t_query
+                     + np.where(
+                         np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                     )[0][0]
+                 )
+         s = 0
+         audio_opt = []
+         t = None
+         t1 = ttime()
+         audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+         p_len = audio_pad.shape[0] // self.window
+         inp_f0 = None
+         if hasattr(f0_file, "name"):
+             try:
+                 with open(f0_file.name, "r") as f:
+                     lines = f.read().strip("\n").split("\n")
+                 inp_f0 = []
+                 for line in lines:
+                     inp_f0.append([float(i) for i in line.split(",")])
+                 inp_f0 = np.array(inp_f0, dtype="float32")
+             except:
+                 traceback.print_exc()
+         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+         pitch, pitchf = None, None
+         if if_f0 == 1:
+             pitch, pitchf = self.get_f0(
+                 input_audio_path,
+                 audio_pad,
+                 p_len,
+                 f0_up_key,
+                 f0_method,
+                 filter_radius,
+                 crepe_hop_length,
+                 inp_f0,
+             )
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             if self.device == "mps":
+                 pitchf = pitchf.astype(np.float32)
+             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+         t2 = ttime()
+         times[1] += t2 - t1
+         for t in opt_ts:
+             t = t // self.window * self.window
+             if if_f0 == 1:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                         pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             else:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         None,
+                         None,
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             s = t
+         if if_f0 == 1:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     pitch[:, t // self.window :] if t is not None else pitch,
+                     pitchf[:, t // self.window :] if t is not None else pitchf,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         else:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     None,
+                     None,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         audio_opt = np.concatenate(audio_opt)
+         if rms_mix_rate != 1:
+             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
+             audio_opt = librosa.resample(
+                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+             )
+         audio_max = np.abs(audio_opt).max() / 0.99
+         max_int16 = 32768
+         if audio_max > 1:
+             max_int16 /= audio_max
+         audio_opt = (audio_opt * max_int16).astype(np.int16)
+         del pitch, pitchf, sid
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return audio_opt
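
Usage note: a minimal sketch of how the class above might be driven, assuming the file is importable as vc_infer_pipeline and that the config attribute values shown (x_pad, x_query, x_center, x_max, is_half, device) are reasonable stand-ins for the fork's defaults. It only exercises the lightweight "pm" pitch path on a synthetic tone; a full voice conversion additionally needs a loaded HuBERT model, a trained net_g generator, and an optional faiss index passed to pipeline().

    # Hypothetical usage sketch -- illustrative values only.
    from types import SimpleNamespace

    import numpy as np

    from vc_infer_pipeline import VC  # assumed module name

    # Assumed config: padding/query windows in seconds plus precision and device flags.
    config = SimpleNamespace(
        x_pad=1, x_query=6, x_center=38, x_max=41, is_half=False, device="cpu"
    )
    vc = VC(tgt_sr=40000, config=config)

    # One second of a 220 Hz tone at the 16 kHz rate the pipeline expects.
    t = np.arange(16000) / 16000.0
    audio = (0.5 * np.sin(2 * np.pi * 220.0 * t)).astype(np.float32)

    p_len = audio.shape[0] // vc.window
    f0_coarse, f0 = vc.get_f0(
        "synthetic_tone",  # cache key; only consulted by the "harvest" path
        audio,
        p_len,
        f0_up_key=0,
        f0_method="pm",
        filter_radius=3,
        crepe_hop_length=160,
    )
    print(f0_coarse.shape, float(np.median(f0[f0 > 0])))  # median f0 should land near 220 Hz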