PhoenixStormJr committed on
Commit
002c1b1
·
verified ·
1 Parent(s): aa82845

Upload extract_f0_print.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. extract_f0_print.py +411 -0
extract_f0_print.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, traceback, sys, parselmouth
2
+
3
+ now_dir = os.getcwd()
4
+ sys.path.append(now_dir)
5
+ from my_utils import load_audio
6
+ import pyworld
7
+ from scipy.io import wavfile
8
+ import numpy as np, logging
9
+ import torchcrepe # Fork Feature. Crepe algo for training and preprocess
10
+ import torch
11
+ from torch import Tensor # Fork Feature. Used for pitch prediction for torch crepe.
12
+ import scipy.signal as signal # Fork Feature hybrid inference
13
+ import tqdm
14
+
15
# Silence numba's verbose logging below WARNING level.
logging.getLogger("numba").setLevel(logging.WARNING)
from multiprocessing import Process

# First CLI argument: the experiment directory that holds the inputs, the
# outputs and the log file of this script.
exp_dir = sys.argv[1]
# Log file shared by printt(); opened in append mode and kept open for the
# lifetime of the process. The encoding is pinned to UTF-8 so that logging a
# path or traceback containing non-ASCII characters cannot raise
# UnicodeEncodeError on platforms whose default encoding is narrower.
f = open("%s/extract_f0_feature.log" % exp_dir, "a+", encoding="utf-8")
20
+
21
+
22
def printt(strr):
    """Print *strr* to stdout and append it as one line to the log file.

    The log handle is the module-level ``f``; it is flushed after every
    write so the line is handed to the OS immediately.

    Args:
        strr: Any object; it is rendered with ``%s`` (i.e. ``str()``).
    """
    print(strr)
    # Wrap the argument in a 1-tuple: a bare ``"%s\n" % strr`` would unpack
    # a tuple argument (TypeError for any length other than 1) instead of
    # formatting it as a whole.
    f.write("%s\n" % (strr,))
    f.flush()
26
+
27
+
28
# Second CLI argument: number of worker processes to spawn.
n_p = int(sys.argv[2])
# Third CLI argument: name of the f0 extraction method.
f0method = sys.argv[3]
# Optional fourth CLI argument: hop length handed to the crepe-based
# extractors. Fall back to 128 when it is missing or not an integer.
# Only those two failure modes are caught, so KeyboardInterrupt/SystemExit
# are no longer swallowed.
try:
    extraction_crepe_hop_length = int(sys.argv[4])
except (IndexError, ValueError):
    print("Temp Issue. echl is not being passed with argument!")
    extraction_crepe_hop_length = 128
36
+
37
+ # print("EXTRACTION CREPE HOP LENGTH: " + str(extraction_crepe_hop_length))
38
+ # print("EXTRACTION CREPE HOP LENGTH TYPE: " + str(type(extraction_crepe_hop_length)))
39
+
40
+
41
class FeatureInput(object):
    """Extract f0 (pitch) curves from audio files and store them as .npy.

    Attributes:
        fs: Sample rate passed to the audio loader and to every extractor.
        hop: Hop size in samples; defines the expected frame count
            (``len(audio) // hop``) and the WORLD frame period.
        f0_bin: Number of coarse pitch bins (256).
        f0_min / f0_max: Pitch search range in Hz (50.0 / 1100.0).
        f0_mel_min / f0_mel_max: The same range on the mel scale used by
            ``coarse_f0``.
    """

    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size

        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    # ------------------------------------------------------------------
    # Private helpers shared by compute_f0 and the hybrid computation
    # ------------------------------------------------------------------

    @staticmethod
    def _select_torch_device(index=0):
        """Return the torch device to run crepe on: CUDA, then MPS, then CPU."""
        if torch.cuda.is_available():
            return torch.device(f"cuda:{index % torch.cuda.device_count()}")
        # Guarded lookup: NOTE(review) some torch builds may not expose
        # ``torch.backends.mps`` at all; treat that as "MPS unavailable".
        mps = getattr(torch.backends, "mps", None)
        if mps is not None and mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    @staticmethod
    def _normalize_audio(x):
        """Return a float32 copy of *x* divided by its 0.999 |x| quantile.

        The input array is never modified. Empty input and input whose
        quantile is zero (e.g. pure silence) are returned unscaled instead
        of being divided by zero.
        """
        x = x.astype(np.float32)
        if x.size == 0:
            return x
        peak = np.quantile(np.abs(x), 0.999)
        if peak > 0:
            x /= peak
        return x

    @staticmethod
    def _resize_pitch(source, p_len):
        """Linearly resample a pitch curve to *p_len* frames.

        Values below 0.001 are turned into NaN before interpolation and
        every NaN in the result is replaced by 0 afterwards.
        """
        source = np.array(source)  # work on a copy
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        return np.nan_to_num(target)

    def _f0_pm(self, x, p_len, time_step, f0_min, f0_max):
        """Parselmouth autocorrelation pitch, zero-padded towards *p_len*.

        *time_step* is given in milliseconds.
        """
        f0 = (
            parselmouth.Sound(x, self.fs)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=f0_min,
                pitch_ceiling=f0_max,
            )
            .selected_array["frequency"]
        )
        # Split the missing frames between the start and the end.
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(
                f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
            )
        return f0

    def _f0_world(self, x, extractor):
        """Run a pyworld extractor (``harvest`` or ``dio``) plus stonemask."""
        x = x.astype(np.double)
        f0, t = extractor(
            x,
            fs=self.fs,
            f0_ceil=self.f0_max,
            f0_floor=self.f0_min,
            frame_period=1000 * self.hop / self.fs,
        )
        return pyworld.stonemask(x, f0, t, self.fs)

    def _f0_crepe(self, x):
        """torchcrepe "full" model with a fixed hop of 160 samples.

        The pitch is mean-filtered, the periodicity median-filtered, and
        frames whose periodicity is below 0.1 are set to 0.
        """
        device = self._select_torch_device()
        # Pick a batch size that doesn't cause memory errors on your gpu
        batch_size = 512
        audio = torch.tensor(np.copy(x))[None].float()
        f0, pd = torchcrepe.predict(
            audio,
            self.fs,
            160,
            self.f0_min,
            self.f0_max,
            "full",
            batch_size=batch_size,
            device=device,
            return_periodicity=True,
        )
        pd = torchcrepe.filter.median(pd, 3)
        f0 = torchcrepe.filter.mean(f0, 3)
        f0[pd < 0.1] = 0
        return f0[0].cpu().numpy()

    def _f0_mangio_crepe(self, x, p_len, crepe_hop_length):
        """torchcrepe with a caller-chosen hop, resampled to *p_len* frames.

        *x* must already be a normalised float32 array (see
        ``_normalize_audio``).
        """
        device = self._select_torch_device()
        audio = torch.from_numpy(x).to(device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        pitch = torchcrepe.predict(
            audio,
            self.fs,
            crepe_hop_length,
            self.f0_min,
            self.f0_max,
            "full",
            batch_size=crepe_hop_length * 2,
            device=device,
            pad=True,
        )
        # Fall back to the crepe frame count when p_len is 0/None.
        p_len = p_len or x.shape[0] // crepe_hop_length
        return self._resize_pitch(pitch.squeeze(0).cpu().float().numpy(), p_len)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    # EXPERIMENTAL. PROBABLY BUGGY
    def get_f0_hybrid_computation(
        self,
        methods_str,
        x,
        f0_min,
        f0_max,
        p_len,
        crepe_hop_length,
        time_step,
    ):
        """Combine several f0 methods by taking their per-frame nan-median.

        Args:
            methods_str: String of the form ``"hybrid[m1+m2+...]"``; the
                part after ``"hybrid"`` is split on ``+``. Known methods:
                ``pm``, ``crepe``, ``mangio-crepe``, ``harvest``, ``dio``.
            x: Audio samples; a normalised float32 copy is used internally.
            f0_min: Pitch floor in Hz (used by the ``pm`` method only).
            f0_max: Pitch ceiling in Hz (used by the ``pm`` method only).
            p_len: Expected number of frames.
            crepe_hop_length: Hop length for ``mangio-crepe``.
            time_step: Frame step in milliseconds for ``pm``.

        Returns:
            The single method's curve if only one method was requested,
            otherwise ``np.nanmedian`` over all curves.

        Raises:
            ValueError: If an unknown method name is requested.
        """
        # Get various f0 methods from input to use in the computation stack
        s = methods_str.split('hybrid')[1]
        s = s.replace('[', '').replace(']', '')
        methods = s.split('+')

        print("Calculating f0 pitch estimations for methods: %s" % str(methods))
        x = self._normalize_audio(x)

        # Get f0 calculations for all methods specified
        f0_computation_stack = []
        for method in methods:
            if method == "pm":
                f0 = self._f0_pm(x, p_len, time_step, f0_min, f0_max)
            elif method == "crepe":
                f0 = self._f0_crepe(x)[1:]  # Get rid of extra first frame
            elif method == "mangio-crepe":
                f0 = self._f0_mangio_crepe(x, p_len, crepe_hop_length)
            elif method == "harvest":
                f0 = signal.medfilt(self._f0_world(x, pyworld.harvest), 3)[1:]
            elif method == "dio":
                f0 = signal.medfilt(self._f0_world(x, pyworld.dio), 3)[1:]
            else:
                # Previously a None entry was stacked and crashed later.
                raise ValueError("Unknown hybrid f0 method: %r" % (method,))
            f0_computation_stack.append(f0)

        for fc in f0_computation_stack:
            print(len(fc))

        if len(f0_computation_stack) == 1:
            return f0_computation_stack[0]
        return np.nanmedian(f0_computation_stack, axis=0)

    def compute_f0(self, path, f0_method, crepe_hop_length):
        """Load the audio at *path* and return its f0 curve.

        Args:
            path: Audio file, loaded with ``load_audio`` at ``self.fs``.
            f0_method: ``pm``, ``harvest``, ``dio``, ``crepe``,
                ``mangio-crepe`` or any string containing ``hybrid``
                (see ``get_f0_hybrid_computation``).
            crepe_hop_length: Hop length for the mangio-crepe extractor.

        Raises:
            ValueError: If *f0_method* is not recognised (this used to
                surface as an UnboundLocalError).
        """
        x = load_audio(path, self.fs)
        p_len = x.shape[0] // self.hop
        if f0_method == "pm":
            time_step = 160 / 16000 * 1000
            return self._f0_pm(x, p_len, time_step, self.f0_min, self.f0_max)
        if f0_method == "harvest":
            return self._f0_world(x, pyworld.harvest)
        if f0_method == "dio":
            return self._f0_world(x, pyworld.dio)
        if f0_method == "crepe":  # Fork Feature: Added crepe f0 for f0 feature extraction
            return self._f0_crepe(x)
        if f0_method == "mangio-crepe":
            return self._f0_mangio_crepe(
                self._normalize_audio(x), p_len, crepe_hop_length
            )
        if "hybrid" in f0_method:  # EXPERIMENTAL
            # Perform hybrid median pitch estimation
            time_step = 160 / 16000 * 1000
            return self.get_f0_hybrid_computation(
                f0_method,
                x,
                self.f0_min,
                self.f0_max,
                p_len,
                crepe_hop_length,
                time_step,
            )
        raise ValueError("Unknown f0 method: %r" % (f0_method,))

    def coarse_f0(self, f0):
        """Quantise an f0 curve in Hz to integer bins in ``[1, f0_bin - 1]``.

        Frames with f0 <= 0 and everything at or below ``f0_min`` map to
        bin 1; everything at or above ``f0_max`` maps to bin 255.
        """
        f0_mel = 1127 * np.log(1 + f0 / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def go(self, paths, f0_method, crepe_hop_length, thread_n):
        """Process a list of ``(inp_path, opt_path1, opt_path2)`` jobs.

        For every job the raw f0 curve is saved to ``opt_path2`` and the
        coarse (binned) curve to ``opt_path1``. Jobs whose two ``.npy``
        outputs already exist are skipped. A failing job is logged through
        ``printt`` and does not stop the remaining jobs.

        Args:
            paths: Sequence of 3-item jobs as described above.
            f0_method: Forwarded to ``compute_f0``.
            crepe_hop_length: Forwarded to ``compute_f0``.
            thread_n: Worker index; used as the progress-bar position.
        """
        if len(paths) == 0:
            printt("no-f0-todo")
            return
        with tqdm.tqdm(total=len(paths), leave=True, position=thread_n) as pbar:
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    pbar.set_description(
                        "thread:%s, f0ing, Hop-Length:%s" % (thread_n, crepe_hop_length)
                    )
                    pbar.update(1)
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method, crepe_hop_length)
                    np.save(opt_path2, featur_pit, allow_pickle=False)  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(opt_path1, coarse_pit, allow_pickle=False)  # ori
                except Exception:
                    # Best effort per file; KeyboardInterrupt/SystemExit are
                    # deliberately no longer swallowed here.
                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
373
+
374
+
375
def list_f0_jobs(inp_root, opt_root1, opt_root2):
    """Build the job list for every file in *inp_root*, sorted by name.

    Each job is ``[inp_path, opt_path1, opt_path2]`` where the two output
    paths reuse the input file name under *opt_root1* / *opt_root2*.
    Files whose *name* contains ``"spec"`` are skipped. The check is made
    on the file name only, so a parent directory that happens to contain
    "spec" no longer causes every file to be skipped.
    """
    jobs = []
    for name in sorted(os.listdir(inp_root)):
        if "spec" in name:
            continue
        jobs.append(
            [
                "%s/%s" % (inp_root, name),
                "%s/%s" % (opt_root1, name),
                "%s/%s" % (opt_root2, name),
            ]
        )
    return jobs


if __name__ == "__main__":
    printt(sys.argv)
    featureInput = FeatureInput()
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    paths = list_f0_jobs(inp_root, opt_root1, opt_root2)

    print("Using f0 method: " + f0method)
    # Worker i handles every n_p-th job starting at index i.
    ps = []
    for i in range(n_p):
        p = Process(
            target=featureInput.go,
            args=(
                paths[i::n_p],
                f0method,
                extraction_crepe_hop_length,
                i,
            ),
        )
        ps.append(p)
        p.start()
    for p in ps:
        p.join()