PhoenixStormJr committed on
Commit
5d2575f
·
verified ·
1 Parent(s): ed49cc4

Upload folder using huggingface_hub

Browse files
Files changed (43) hide show
  1. uvr5_pack/lib_v5/dataset.py +183 -0
  2. uvr5_pack/lib_v5/layers.py +118 -0
  3. uvr5_pack/lib_v5/layers_123812KB .py +118 -0
  4. uvr5_pack/lib_v5/layers_123821KB.py +118 -0
  5. uvr5_pack/lib_v5/layers_33966KB.py +126 -0
  6. uvr5_pack/lib_v5/layers_537227KB.py +126 -0
  7. uvr5_pack/lib_v5/layers_537238KB.py +126 -0
  8. uvr5_pack/lib_v5/layers_new.py +125 -0
  9. uvr5_pack/lib_v5/model_param_init.py +69 -0
  10. uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json +19 -0
  11. uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json +19 -0
  12. uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json +19 -0
  13. uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json +19 -0
  14. uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json +19 -0
  15. uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json +19 -0
  16. uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json +19 -0
  17. uvr5_pack/lib_v5/modelparams/2band_32000.json +30 -0
  18. uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json +30 -0
  19. uvr5_pack/lib_v5/modelparams/2band_48000.json +30 -0
  20. uvr5_pack/lib_v5/modelparams/3band_44100.json +42 -0
  21. uvr5_pack/lib_v5/modelparams/3band_44100_mid.json +43 -0
  22. uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json +43 -0
  23. uvr5_pack/lib_v5/modelparams/4band_44100.json +54 -0
  24. uvr5_pack/lib_v5/modelparams/4band_44100_mid.json +55 -0
  25. uvr5_pack/lib_v5/modelparams/4band_44100_msb.json +55 -0
  26. uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json +55 -0
  27. uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json +55 -0
  28. uvr5_pack/lib_v5/modelparams/4band_44100_sw.json +55 -0
  29. uvr5_pack/lib_v5/modelparams/4band_v2.json +54 -0
  30. uvr5_pack/lib_v5/modelparams/4band_v2_sn.json +55 -0
  31. uvr5_pack/lib_v5/modelparams/4band_v3.json +54 -0
  32. uvr5_pack/lib_v5/modelparams/ensemble.json +43 -0
  33. uvr5_pack/lib_v5/nets.py +123 -0
  34. uvr5_pack/lib_v5/nets_123812KB.py +122 -0
  35. uvr5_pack/lib_v5/nets_123821KB.py +122 -0
  36. uvr5_pack/lib_v5/nets_33966KB.py +122 -0
  37. uvr5_pack/lib_v5/nets_537227KB.py +123 -0
  38. uvr5_pack/lib_v5/nets_537238KB.py +123 -0
  39. uvr5_pack/lib_v5/nets_61968KB.py +122 -0
  40. uvr5_pack/lib_v5/nets_new.py +132 -0
  41. uvr5_pack/lib_v5/spec_utils.py +667 -0
  42. uvr5_pack/name_params.json +263 -0
  43. uvr5_pack/utils.py +120 -0
uvr5_pack/lib_v5/dataset.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.utils.data
7
+ from tqdm import tqdm
8
+
9
+ from uvr5_pack.lib_v5 import spec_utils
10
+
11
+
12
class VocalRemoverValidationSet(torch.utils.data.Dataset):
    """Dataset that serves pre-computed validation patches stored as ``.npz`` files.

    Every file named in ``patch_list`` must contain the arrays ``X`` and ``y``.
    Items are returned as the element-wise magnitudes of those arrays.
    """

    def __init__(self, patch_list):
        """Remember the list of ``.npz`` patch paths."""
        self.patch_list = patch_list

    def __len__(self):
        """Return the number of patches."""
        return len(self.patch_list)

    def __getitem__(self, idx):
        """Load patch ``idx`` and return ``(abs(X), abs(y))``."""
        path = self.patch_list[idx]
        # np.load() on an .npz returns a lazy archive object that keeps the
        # file open; the context manager closes it once both arrays are read.
        with np.load(path) as data:
            X_mag = np.abs(data["X"])
            y_mag = np.abs(data["y"])

        return X_mag, y_mag
29
+
30
+
31
def make_pair(mix_dir, inst_dir):
    """Pair mixture files with instrumental files by sorted position.

    Both directories are listed, filtered to known audio extensions
    (compared case-insensitively, so ``song.WAV`` is accepted), and sorted.
    The i-th mixture is paired with the i-th instrumental; file names are not
    compared with each other.

    Args:
        mix_dir: Directory containing the mixture files.
        inst_dir: Directory containing the instrumental files.

    Returns:
        A list of ``(mixture_path, instrumental_path)`` tuples.
    """
    input_exts = {".wav", ".m4a", ".mp3", ".mp4", ".flac"}

    def list_audio(directory):
        # Sorted full paths of every entry whose extension is a known audio type.
        return sorted(
            os.path.join(directory, fname)
            for fname in os.listdir(directory)
            if os.path.splitext(fname)[1].lower() in input_exts
        )

    X_list = list_audio(mix_dir)
    y_list = list_audio(inst_dir)

    # NOTE: zip() stops at the shorter list, so surplus files in either
    # directory are dropped without an error.
    return list(zip(X_list, y_list))
52
+
53
+
54
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
    """Split the dataset into training and validation file pairs.

    Args:
        dataset_dir: Root directory of the dataset.
        split_mode: ``"random"`` reads ``mixtures``/``instruments`` under
            ``dataset_dir`` and splits after shuffling; ``"subdirs"`` reads
            the ``training`` and ``validation`` sub-directories.
        val_rate: Fraction of pairs used for validation in ``"random"`` mode
            when ``val_filelist`` is empty.
        val_filelist: Optional preset validation pairs (``"random"`` mode
            only). Entries are compared against ``list(pair)``, so each entry
            must be a list, not a tuple.

    Returns:
        ``(train_filelist, val_filelist)``.

    Raises:
        ValueError: If ``val_filelist`` is given in ``"subdirs"`` mode, or if
            ``split_mode`` is not one of the two supported values.
    """
    if split_mode == "random":
        filelist = make_pair(
            os.path.join(dataset_dir, "mixtures"),
            os.path.join(dataset_dir, "instruments"),
        )

        random.shuffle(filelist)

        if len(val_filelist) == 0:
            val_size = int(len(filelist) * val_rate)
            # Split on an explicit index: with val_size == 0 the slices
            # [:-0] / [-0:] would yield an empty training set and put every
            # pair into validation.
            split_at = max(len(filelist) - val_size, 0)
            train_filelist = filelist[:split_at]
            val_filelist = filelist[split_at:]
        else:
            train_filelist = [
                pair for pair in filelist if list(pair) not in val_filelist
            ]
    elif split_mode == "subdirs":
        if len(val_filelist) != 0:
            raise ValueError(
                "The `val_filelist` option is not available in `subdirs` mode"
            )

        train_filelist = make_pair(
            os.path.join(dataset_dir, "training/mixtures"),
            os.path.join(dataset_dir, "training/instruments"),
        )

        val_filelist = make_pair(
            os.path.join(dataset_dir, "validation/mixtures"),
            os.path.join(dataset_dir, "validation/instruments"),
        )
    else:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError("Unknown split_mode: {!r}".format(split_mode))

    return train_filelist, val_filelist
88
+
89
+
90
def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
    """Randomly augment the paired arrays ``X`` and ``y`` in place.

    Samples are visited in a random order. For each one, independent uniform
    draws decide whether to apply: aggressive vocal reduction on ``y``
    (probability ``reduction_rate``), reversal along the sample's first axis
    (0.5), averaging over that axis (0.02), replacing ``X`` with ``y`` (0.02),
    and mixup with the next sample in the visiting order (``mixup_rate``,
    never for the last sample).

    Returns:
        The same ``X`` and ``y`` objects, modified in place.
    """
    order = np.random.permutation(len(X))
    last_pos = len(order) - 1

    for pos, cur in enumerate(tqdm(order)):
        if np.random.uniform() < reduction_rate:
            reduced = spec_utils.reduce_vocal_aggressively(
                X[cur], y[cur], reduction_mask
            )
            y[cur] = reduced

        # swap channel
        if np.random.uniform() < 0.5:
            X[cur] = X[cur, ::-1]
            y[cur] = y[cur, ::-1]

        # mono (the mean is broadcast back over the averaged axis)
        if np.random.uniform() < 0.02:
            X[cur] = X[cur].mean(axis=0, keepdims=True)
            y[cur] = y[cur].mean(axis=0, keepdims=True)

        # inst
        if np.random.uniform() < 0.02:
            X[cur] = y[cur]

        # The uniform draw happens for every sample, including the last one,
        # so the random stream does not depend on the position check.
        if np.random.uniform() < mixup_rate and pos < last_pos:
            weight = np.random.beta(mixup_alpha, mixup_alpha)
            partner = order[pos + 1]
            X[cur] = weight * X[cur] + (1 - weight) * X[partner]
            y[cur] = weight * y[cur] + (1 - weight) * y[partner]

    return X, y
116
+
117
+
118
def make_padding(width, cropsize, offset):
    """Compute left/right padding so ``width`` can be tiled into crops.

    Args:
        width: Length of the axis that will be padded.
        cropsize: Width of one crop.
        offset: Margin on each side of a crop; also used as the left padding.

    Returns:
        ``(left, right, roi_size)`` where ``roi_size`` is
        ``cropsize - 2 * offset`` (or ``cropsize`` when that difference is 0)
        and ``right`` is ``roi_size - width % roi_size + offset``.
    """
    roi_size = cropsize - offset * 2
    if roi_size == 0:
        # Avoid a zero modulus below: fall back to the full crop width.
        roi_size = cropsize

    remainder = width % roi_size
    return offset, roi_size - remainder + offset, roi_size
126
+
127
+
128
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
    """Cut ``patches`` random crops of width ``cropsize`` from every file pair.

    Args:
        filelist: Sequence of ``(mixture_path, instrumental_path)`` pairs.
        cropsize: Width of each crop along the last axis.
        patches: Number of crops taken per pair.
        sr, hop_length, n_fft: Forwarded to ``spec_utils.cache_or_load``.
        offset: Margin forwarded to ``make_padding``.

    Returns:
        ``(X_dataset, y_dataset)``: two ``complex64`` arrays of shape
        ``(patches * len(filelist), 2, n_fft // 2 + 1, cropsize)``.
    """
    len_dataset = patches * len(filelist)
    shape = (len_dataset, 2, n_fft // 2 + 1, cropsize)

    X_dataset = np.zeros(shape, dtype=np.complex64)
    y_dataset = np.zeros(shape, dtype=np.complex64)

    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
        # presumably returns two arrays laid out as (2, bins, frames) -- the
        # assignments into the datasets below rely on it; verify in spec_utils.
        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
        coef = np.max([np.abs(X).max(), np.abs(y).max()])
        # An all-zero (silent) pair has coef == 0; dividing would fill the
        # patches with NaN, so leave such a pair unscaled.
        if coef > 0:
            X, y = X / coef, y / coef

        pad_l, pad_r, _ = make_padding(X.shape[2], cropsize, offset)
        X_pad = np.pad(X, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
        y_pad = np.pad(y, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

        max_start = X_pad.shape[2] - cropsize
        if max_start == 0:
            # Exactly one crop fits; randint(0, 0) would raise ValueError.
            starts = np.zeros(patches, dtype=int)
        else:
            starts = np.random.randint(0, max_start, patches)
        ends = starts + cropsize
        for j in range(patches):
            idx = i * patches + j
            X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
            y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]

    return X_dataset, y_dataset
151
+
152
+
153
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
    """Write validation patches to disk and return a dataset that reads them.

    Patches are stored as ``.npz`` files in a directory, relative to the
    current working directory, named after the parameters
    (``cs{cropsize}_sr{sr}_hl{hop_length}_nf{n_fft}_of{offset}``). A patch
    file that already exists is reused, never rewritten. File names are
    derived only from the mixture's basename, so two inputs with the same
    basename share patch files.

    Args:
        filelist: Sequence of ``(mixture_path, instrumental_path)`` pairs.
        cropsize: Width of each patch along the last axis.
        sr, hop_length, n_fft: Forwarded to ``spec_utils.cache_or_load``.
        offset: Margin forwarded to ``make_padding``.

    Returns:
        A ``VocalRemoverValidationSet`` over the patch paths.
    """
    patch_list = []
    patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
        cropsize, sr, hop_length, n_fft, offset
    )
    os.makedirs(patch_dir, exist_ok=True)

    for X_path, y_path in tqdm(filelist):
        basename = os.path.splitext(os.path.basename(X_path))[0]

        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
        coef = np.max([np.abs(X).max(), np.abs(y).max()])
        # An all-zero (silent) pair has coef == 0; dividing would write NaN
        # patches, so leave such a pair unscaled.
        if coef > 0:
            X, y = X / coef, y / coef

        pad_l, pad_r, roi_size = make_padding(X.shape[2], cropsize, offset)
        X_pad = np.pad(X, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
        y_pad = np.pad(y, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

        # Patches start every roi_size frames and are cropsize frames wide.
        len_dataset = int(np.ceil(X.shape[2] / roi_size))
        for j in range(len_dataset):
            outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
            start = j * roi_size
            if not os.path.exists(outpath):
                np.savez(
                    outpath,
                    X=X_pad[:, :, start : start + cropsize],
                    y=y_pad[:, :, start : start + cropsize],
                )
            patch_list.append(outpath)

    return VocalRemoverValidationSet(patch_list)
uvr5_pack/lib_v5/layers.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """Bias-free ``nn.Conv2d`` followed by ``nn.BatchNorm2d`` and an activation.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        pad: Padding passed to the convolution.
        dilation: Convolution dilation.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through conv -> batch norm -> activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise conv + 1x1 pointwise conv, then ``nn.BatchNorm2d`` and an activation.

    The first convolution uses ``groups=nin`` (one filter group per input
    channel) and keeps ``nin`` channels; the second is a 1x1 convolution that
    maps ``nin`` to ``nout`` channels. Neither convolution has a bias.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        pad: Padding of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through the separable conv, batch norm and activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked ``Conv2DBNActiv`` blocks that also expose the first block's output.

    ``conv1`` maps ``nin`` to ``nout`` channels with stride 1; ``conv2`` keeps
    ``nout`` channels and applies ``stride``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the output of ``conv2`` and the output of ``conv1``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample by 2, optionally concatenate a skip tensor, then convolve.

    Args:
        nin: Input channels of the convolution (counted after any skip
            concatenation).
        nout: Output channels.
        ksize: Convolution kernel size.
        stride: Not used; the convolution is always built with stride 1.
        pad: Padding passed to the convolution.
        activ: Activation class used by the convolution block.
        dropout: If true, ``nn.Dropout2d(0.1)`` is applied to the output.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        """Return the decoded tensor for ``x`` and the optional ``skip``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # presumably crops `skip` to x's size so the two can be
            # concatenated on the channel axis -- verify in spec_utils.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
84
+
85
+
86
class ASPPModule(nn.Module):
    """Five parallel branches over the same input, concatenated and fused by a 1x1 conv.

    Branches: (1) average pool over dim 2 followed by a 1x1 conv, resized
    back to the input size; (2) a 1x1 conv; (3-5) 3x3 separable convs using
    ``dilations[0]``, ``dilations[1]`` and ``dilations[2]``. The concatenated
    ``nin * 5`` channels are reduced to ``nout`` and passed through
    ``nn.Dropout2d(0.1)``.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        rate_a, rate_b, rate_c = dilations[0], dilations[1], dilations[2]

        pooled_branch = [
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        ]
        self.conv1 = nn.Sequential(*pooled_branch)
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Each dilation rate is passed as both the padding and the dilation.
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_a, rate_a, activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_b, rate_b, activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1),
        )

    def forward(self, x):
        """Run all branches on ``x`` and return the fused result."""
        _, _, height, width = x.size()
        pooled = self.conv1(x)
        branches = [
            F.interpolate(
                pooled, size=(height, width), mode="bilinear", align_corners=True
            ),
            self.conv2(x),
            self.conv3(x),
            self.conv4(x),
            self.conv5(x),
        ]
        return self.bottleneck(torch.cat(branches, dim=1))
uvr5_pack/lib_v5/layers_123812KB .py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """Bias-free ``nn.Conv2d`` followed by ``nn.BatchNorm2d`` and an activation.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        pad: Padding passed to the convolution.
        dilation: Convolution dilation.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through conv -> batch norm -> activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise conv + 1x1 pointwise conv, then ``nn.BatchNorm2d`` and an activation.

    The first convolution uses ``groups=nin`` (one filter group per input
    channel) and keeps ``nin`` channels; the second is a 1x1 convolution that
    maps ``nin`` to ``nout`` channels. Neither convolution has a bias.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        pad: Padding of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through the separable conv, batch norm and activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked ``Conv2DBNActiv`` blocks that also expose the first block's output.

    ``conv1`` maps ``nin`` to ``nout`` channels with stride 1; ``conv2`` keeps
    ``nout`` channels and applies ``stride``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the output of ``conv2`` and the output of ``conv1``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample by 2, optionally concatenate a skip tensor, then convolve.

    Args:
        nin: Input channels of the convolution (counted after any skip
            concatenation).
        nout: Output channels.
        ksize: Convolution kernel size.
        stride: Not used; the convolution is always built with stride 1.
        pad: Padding passed to the convolution.
        activ: Activation class used by the convolution block.
        dropout: If true, ``nn.Dropout2d(0.1)`` is applied to the output.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        """Return the decoded tensor for ``x`` and the optional ``skip``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # presumably crops `skip` to x's size so the two can be
            # concatenated on the channel axis -- verify in spec_utils.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
84
+
85
+
86
class ASPPModule(nn.Module):
    """Five parallel branches over the same input, concatenated and fused by a 1x1 conv.

    Branches: (1) average pool over dim 2 followed by a 1x1 conv, resized
    back to the input size; (2) a 1x1 conv; (3-5) 3x3 separable convs using
    ``dilations[0]``, ``dilations[1]`` and ``dilations[2]``. The concatenated
    ``nin * 5`` channels are reduced to ``nout`` and passed through
    ``nn.Dropout2d(0.1)``.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        rate_a, rate_b, rate_c = dilations[0], dilations[1], dilations[2]

        pooled_branch = [
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        ]
        self.conv1 = nn.Sequential(*pooled_branch)
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Each dilation rate is passed as both the padding and the dilation.
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_a, rate_a, activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_b, rate_b, activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1),
        )

    def forward(self, x):
        """Run all branches on ``x`` and return the fused result."""
        _, _, height, width = x.size()
        pooled = self.conv1(x)
        branches = [
            F.interpolate(
                pooled, size=(height, width), mode="bilinear", align_corners=True
            ),
            self.conv2(x),
            self.conv3(x),
            self.conv4(x),
            self.conv5(x),
        ]
        return self.bottleneck(torch.cat(branches, dim=1))
uvr5_pack/lib_v5/layers_123821KB.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """Bias-free ``nn.Conv2d`` followed by ``nn.BatchNorm2d`` and an activation.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        pad: Padding passed to the convolution.
        dilation: Convolution dilation.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through conv -> batch norm -> activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise conv + 1x1 pointwise conv, then ``nn.BatchNorm2d`` and an activation.

    The first convolution uses ``groups=nin`` (one filter group per input
    channel) and keeps ``nin`` channels; the second is a 1x1 convolution that
    maps ``nin`` to ``nout`` channels. Neither convolution has a bias.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        pad: Padding of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through the separable conv, batch norm and activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked ``Conv2DBNActiv`` blocks that also expose the first block's output.

    ``conv1`` maps ``nin`` to ``nout`` channels with stride 1; ``conv2`` keeps
    ``nout`` channels and applies ``stride``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the output of ``conv2`` and the output of ``conv1``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample by 2, optionally concatenate a skip tensor, then convolve.

    Args:
        nin: Input channels of the convolution (counted after any skip
            concatenation).
        nout: Output channels.
        ksize: Convolution kernel size.
        stride: Not used; the convolution is always built with stride 1.
        pad: Padding passed to the convolution.
        activ: Activation class used by the convolution block.
        dropout: If true, ``nn.Dropout2d(0.1)`` is applied to the output.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        """Return the decoded tensor for ``x`` and the optional ``skip``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # presumably crops `skip` to x's size so the two can be
            # concatenated on the channel axis -- verify in spec_utils.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
84
+
85
+
86
class ASPPModule(nn.Module):
    """Five parallel branches over the same input, concatenated and fused by a 1x1 conv.

    Branches: (1) average pool over dim 2 followed by a 1x1 conv, resized
    back to the input size; (2) a 1x1 conv; (3-5) 3x3 separable convs using
    ``dilations[0]``, ``dilations[1]`` and ``dilations[2]``. The concatenated
    ``nin * 5`` channels are reduced to ``nout`` and passed through
    ``nn.Dropout2d(0.1)``.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        rate_a, rate_b, rate_c = dilations[0], dilations[1], dilations[2]

        pooled_branch = [
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        ]
        self.conv1 = nn.Sequential(*pooled_branch)
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Each dilation rate is passed as both the padding and the dilation.
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_a, rate_a, activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_b, rate_b, activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1),
        )

    def forward(self, x):
        """Run all branches on ``x`` and return the fused result."""
        _, _, height, width = x.size()
        pooled = self.conv1(x)
        branches = [
            F.interpolate(
                pooled, size=(height, width), mode="bilinear", align_corners=True
            ),
            self.conv2(x),
            self.conv3(x),
            self.conv4(x),
            self.conv5(x),
        ]
        return self.bottleneck(torch.cat(branches, dim=1))
uvr5_pack/lib_v5/layers_33966KB.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """Bias-free ``nn.Conv2d`` followed by ``nn.BatchNorm2d`` and an activation.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        pad: Padding passed to the convolution.
        dilation: Convolution dilation.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through conv -> batch norm -> activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise conv + 1x1 pointwise conv, then ``nn.BatchNorm2d`` and an activation.

    The first convolution uses ``groups=nin`` (one filter group per input
    channel) and keeps ``nin`` channels; the second is a 1x1 convolution that
    maps ``nin`` to ``nout`` channels. Neither convolution has a bias.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        pad: Padding of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through the separable conv, batch norm and activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked ``Conv2DBNActiv`` blocks that also expose the first block's output.

    ``conv1`` maps ``nin`` to ``nout`` channels with stride 1; ``conv2`` keeps
    ``nout`` channels and applies ``stride``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the output of ``conv2`` and the output of ``conv1``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample by 2, optionally concatenate a skip tensor, then convolve.

    Args:
        nin: Input channels of the convolution (counted after any skip
            concatenation).
        nout: Output channels.
        ksize: Convolution kernel size.
        stride: Not used; the convolution is always built with stride 1.
        pad: Padding passed to the convolution.
        activ: Activation class used by the convolution block.
        dropout: If true, ``nn.Dropout2d(0.1)`` is applied to the output.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        """Return the decoded tensor for ``x`` and the optional ``skip``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # presumably crops `skip` to x's size so the two can be
            # concatenated on the channel axis -- verify in spec_utils.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
84
+
85
+
86
class ASPPModule(nn.Module):
    """Seven parallel branches over the same input, concatenated and fused by a 1x1 conv.

    Branches: (1) average pool over dim 2 followed by a 1x1 conv, resized
    back to the input size; (2) a 1x1 conv; (3-7) 3x3 separable convs. The
    concatenated ``nin * 7`` channels are reduced to ``nout`` and passed
    through ``nn.Dropout2d(0.1)``.

    NOTE(review): ``conv5``, ``conv6`` and ``conv7`` are all built with
    ``dilations[2]``; ``dilations[3]`` and ``dilations[4]`` are never read.
    Changing a dilation would change what already-trained weights compute,
    so this is reproduced as-is -- confirm intent before altering it.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        rate_a, rate_b, rate_c = dilations[0], dilations[1], dilations[2]

        pooled_branch = [
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        ]
        self.conv1 = nn.Sequential(*pooled_branch)
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Each dilation rate is passed as both the padding and the dilation.
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_a, rate_a, activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_b, rate_b, activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1),
        )

    def forward(self, x):
        """Run all branches on ``x`` and return the fused result."""
        _, _, height, width = x.size()
        pooled = self.conv1(x)
        branches = [
            F.interpolate(
                pooled, size=(height, width), mode="bilinear", align_corners=True
            ),
            self.conv2(x),
            self.conv3(x),
            self.conv4(x),
            self.conv5(x),
            self.conv6(x),
            self.conv7(x),
        ]
        return self.bottleneck(torch.cat(branches, dim=1))
uvr5_pack/lib_v5/layers_537227KB.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """Bias-free ``nn.Conv2d`` followed by ``nn.BatchNorm2d`` and an activation.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        pad: Padding passed to the convolution.
        dilation: Convolution dilation.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through conv -> batch norm -> activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise conv + 1x1 pointwise conv, then ``nn.BatchNorm2d`` and an activation.

    The first convolution uses ``groups=nin`` (one filter group per input
    channel) and keeps ``nin`` channels; the second is a 1x1 convolution that
    maps ``nin`` to ``nout`` channels. Neither convolution has a bias.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        pad: Padding of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through the separable conv, batch norm and activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked ``Conv2DBNActiv`` blocks that also expose the first block's output.

    ``conv1`` maps ``nin`` to ``nout`` channels with stride 1; ``conv2`` keeps
    ``nout`` channels and applies ``stride``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the output of ``conv2`` and the output of ``conv1``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample by 2, optionally concatenate a skip tensor, then convolve.

    Args:
        nin: Input channels of the convolution (counted after any skip
            concatenation).
        nout: Output channels.
        ksize: Convolution kernel size.
        stride: Not used; the convolution is always built with stride 1.
        pad: Padding passed to the convolution.
        activ: Activation class used by the convolution block.
        dropout: If true, ``nn.Dropout2d(0.1)`` is applied to the output.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        """Return the decoded tensor for ``x`` and the optional ``skip``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # presumably crops `skip` to x's size so the two can be
            # concatenated on the channel axis -- verify in spec_utils.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
84
+
85
+
86
class ASPPModule(nn.Module):
    """Seven parallel branches over the same input, concatenated and fused by a 1x1 conv.

    Branches: (1) average pool over dim 2 followed by a 1x1 conv, resized
    back to the input size; (2) a 1x1 conv; (3-7) 3x3 separable convs. The
    concatenated ``nin * 7`` channels are reduced to ``nout`` and passed
    through ``nn.Dropout2d(0.1)``.

    NOTE(review): ``conv5``, ``conv6`` and ``conv7`` are all built with
    ``dilations[2]``; ``dilations[3]`` and ``dilations[4]`` are never read.
    Changing a dilation would change what already-trained weights compute,
    so this is reproduced as-is -- confirm intent before altering it.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        rate_a, rate_b, rate_c = dilations[0], dilations[1], dilations[2]

        pooled_branch = [
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        ]
        self.conv1 = nn.Sequential(*pooled_branch)
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Each dilation rate is passed as both the padding and the dilation.
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_a, rate_a, activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_b, rate_b, activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1),
        )

    def forward(self, x):
        """Run all branches on ``x`` and return the fused result."""
        _, _, height, width = x.size()
        pooled = self.conv1(x)
        branches = [
            F.interpolate(
                pooled, size=(height, width), mode="bilinear", align_corners=True
            ),
            self.conv2(x),
            self.conv3(x),
            self.conv4(x),
            self.conv5(x),
            self.conv6(x),
            self.conv7(x),
        ]
        return self.bottleneck(torch.cat(branches, dim=1))
uvr5_pack/lib_v5/layers_537238KB.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """Bias-free ``nn.Conv2d`` followed by ``nn.BatchNorm2d`` and an activation.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        pad: Padding passed to the convolution.
        dilation: Convolution dilation.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through conv -> batch norm -> activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise conv + 1x1 pointwise conv, then ``nn.BatchNorm2d`` and an activation.

    The first convolution uses ``groups=nin`` (one filter group per input
    channel) and keeps ``nin`` channels; the second is a 1x1 convolution that
    maps ``nin`` to ``nout`` channels. Neither convolution has a bias.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        pad: Padding of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        activ: Activation class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through the separable conv, batch norm and activation."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked ``Conv2DBNActiv`` blocks that also expose the first block's output.

    ``conv1`` maps ``nin`` to ``nout`` channels with stride 1; ``conv2`` keeps
    ``nout`` channels and applies ``stride``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the output of ``conv2`` and the output of ``conv1``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample by 2, optionally concatenate a skip tensor, then convolve.

    Args:
        nin: Input channels of the convolution (counted after any skip
            concatenation).
        nout: Output channels.
        ksize: Convolution kernel size.
        stride: Not used; the convolution is always built with stride 1.
        pad: Padding passed to the convolution.
        activ: Activation class used by the convolution block.
        dropout: If true, ``nn.Dropout2d(0.1)`` is applied to the output.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        """Return the decoded tensor for ``x`` and the optional ``skip``."""
        # Defined as forward() instead of overriding __call__ so that
        # nn.Module.__call__ keeps running any registered hooks.
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # presumably crops `skip` to x's size so the two can be
            # concatenated on the channel axis -- verify in spec_utils.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
84
+
85
+
86
class ASPPModule(nn.Module):
    """Seven parallel branches over the same input, concatenated and fused by a 1x1 conv.

    Branches: (1) average pool over dim 2 followed by a 1x1 conv, resized
    back to the input size; (2) a 1x1 conv; (3-7) 3x3 separable convs. The
    concatenated ``nin * 7`` channels are reduced to ``nout`` and passed
    through ``nn.Dropout2d(0.1)``.

    NOTE(review): ``conv5``, ``conv6`` and ``conv7`` are all built with
    ``dilations[2]``; ``dilations[3]`` and ``dilations[4]`` are never read.
    Changing a dilation would change what already-trained weights compute,
    so this is reproduced as-is -- confirm intent before altering it.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        rate_a, rate_b, rate_c = dilations[0], dilations[1], dilations[2]

        pooled_branch = [
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        ]
        self.conv1 = nn.Sequential(*pooled_branch)
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Each dilation rate is passed as both the padding and the dilation.
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_a, rate_a, activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_b, rate_b, activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1),
        )

    def forward(self, x):
        """Run all branches on ``x`` and return the fused result."""
        _, _, height, width = x.size()
        pooled = self.conv1(x)
        branches = [
            F.interpolate(
                pooled, size=(height, width), mode="bilinear", align_corners=True
            ),
            self.conv2(x),
            self.conv3(x),
            self.conv4(x),
            self.conv5(x),
            self.conv6(x),
            self.conv7(x),
        ]
        return self.bottleneck(torch.cat(branches, dim=1))
uvr5_pack/lib_v5/layers_new.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """Bias-free ``Conv2d`` followed by ``BatchNorm2d`` and an activation.

    Args:
        nin: number of input channels.
        nout: number of output channels.
        ksize: convolution kernel size.
        stride: convolution stride.
        pad: convolution padding.
        dilation: convolution dilation.
        activ: activation *class* (not instance); it is instantiated with
            no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,  # BatchNorm supplies the shift term
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    # Defined as ``forward`` rather than overriding ``__call__`` so that
    # nn.Module's own ``__call__`` machinery (forward hooks etc.) still runs.
    # ``module(x)`` keeps working exactly as before.
    def forward(self, x):
        """Apply conv -> batch norm -> activation to ``x``."""
        return self.conv(x)
27
+
28
+
29
class Encoder(nn.Module):
    """Two stacked ``Conv2DBNActiv`` layers.

    The first layer maps ``nin`` -> ``nout`` channels with the requested
    ``stride``; the second keeps ``nout`` channels and always uses stride 1.

    Args:
        nin: number of input channels.
        nout: number of output channels.
        ksize: kernel size for both convolutions.
        stride: stride of the first convolution only.
        pad: padding for both convolutions.
        activ: activation class passed to both layers.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)

    # ``forward`` instead of an overridden ``__call__`` so nn.Module's call
    # machinery (hooks) is not bypassed; ``encoder(x)`` behaves as before.
    def forward(self, x):
        """Return ``conv2(conv1(x))``."""
        return self.conv2(self.conv1(x))
40
+
41
+
42
class Decoder(nn.Module):
    """Upsample by 2, optionally concatenate a skip tensor, then convolve.

    Args:
        nin: channels entering the convolution (i.e. after any skip
            concatenation).
        nout: number of output channels.
        ksize: convolution kernel size.
        stride: accepted for signature compatibility; the convolution always
            uses stride 1.
        pad: convolution padding.
        activ: activation class for the convolution block.
        dropout: when truthy, apply ``Dropout2d(0.1)`` to the output.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    # ``forward`` instead of an overridden ``__call__`` so nn.Module's call
    # machinery (hooks) is not bypassed; ``decoder(x, skip)`` behaves as before.
    def forward(self, x, skip=None):
        """Return the decoded features for ``x`` (and ``skip`` if given)."""
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)

        if skip is not None:
            # presumably crops ``skip`` to match ``x`` before concatenation —
            # verify against spec_utils.crop_center
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)

        h = self.conv1(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
65
+
66
+
67
class ASPPModule(nn.Module):
    """Five parallel branches over the same input, fused by a 1x1 bottleneck.

    Branches: a pooled branch (height averaged down to 1, 1x1 conv, then
    resized back), a plain 1x1 conv, and three dilated 3x3 convs using
    ``dilations[0..2]``. Each branch emits ``nout`` channels; the bottleneck
    maps the ``5 * nout`` concatenation back to ``nout``. ``Dropout2d(0.1)``
    is applied afterwards only when ``dropout`` is truthy.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
        super().__init__()

        def dilated(rate):
            # 3x3 conv whose padding equals its dilation rate.
            return Conv2DBNActiv(nin, nout, 3, 1, rate, rate, activ=activ)

        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
        self.conv3 = dilated(dilations[0])
        self.conv4 = dilated(dilations[1])
        self.conv5 = dilated(dilations[2])
        self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x):
        """Run all branches on ``x`` (4-D) and return the fused result."""
        _, _, height, width = x.size()
        # The pooled branch has height 1; resize it back to the input size.
        pooled = F.interpolate(
            self.conv1(x), size=(height, width), mode="bilinear", align_corners=True
        )
        branches = [pooled]
        for conv in (self.conv2, self.conv3, self.conv4, self.conv5):
            branches.append(conv(x))
        fused = self.bottleneck(torch.cat(branches, dim=1))

        if self.dropout is None:
            return fused
        return self.dropout(fused)
103
+
104
+
105
class LSTMModule(nn.Module):
    """1x1 conv to a single channel, bidirectional LSTM, then a dense layer.

    Args:
        nin_conv: channels of the 4-D input tensor.
        nin_lstm: LSTM input size; must equal the input's third dimension,
            since that axis is fed to the LSTM as features.
        nout_lstm: combined output size of the two LSTM directions (each
            direction gets ``nout_lstm // 2`` hidden units).
    """

    def __init__(self, nin_conv, nin_lstm, nout_lstm):
        super().__init__()
        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
        self.lstm = nn.LSTM(
            input_size=nin_lstm,
            hidden_size=nout_lstm // 2,
            bidirectional=True,
        )
        self.dense = nn.Sequential(
            nn.Linear(nout_lstm, nin_lstm),
            nn.BatchNorm1d(nin_lstm),
            nn.ReLU(),
        )

    def forward(self, x):
        """Map ``(N, C, nbins, nframes)`` to ``(N, 1, nbins, nframes)``."""
        batch, _, nbins, nframes = x.size()

        # Drop the single conv channel and put the frame axis first:
        # (N, nbins, nframes) -> (nframes, N, nbins)
        seq = self.conv(x)[:, 0].permute(2, 0, 1)
        seq, _ = self.lstm(seq)

        # Flatten frames and batch together for the dense layer:
        # (nframes * N, nbins)
        flat = self.dense(seq.reshape(-1, seq.size()[-1]))

        # Restore (nframes, N, 1, nbins), then move frames to the last axis.
        return flat.reshape(nframes, batch, 1, nbins).permute(1, 2, 3, 0)
uvr5_pack/lib_v5/model_param_init.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pathlib
4
+
5
# Fallback parameter set used by ModelParameters when the config path is
# neither a ".pth" archive nor a ".json" file. Band keys are ints.
default_param = {
    "bins": 768,
    "unstable_bins": 9,  # training only
    "reduction_bins": 762,  # training only
    "sr": 44100,
    "pre_filter_start": 757,
    "pre_filter_stop": 768,
    "band": {
        1: {
            "sr": 11025,
            "hl": 128,
            "n_fft": 960,
            "crop_start": 0,
            "crop_stop": 245,
            "lpf_start": 61,  # inference only
            "res_type": "polyphase",
        },
        2: {
            "sr": 44100,
            "hl": 512,
            "n_fft": 1536,
            "crop_start": 24,
            "crop_stop": 547,
            "hpf_start": 81,  # inference only
            "res_type": "sinc_best",
        },
    },
}
34
+
35
+
36
def int_keys(d):
    """Build a dict from ``(key, value)`` pairs, converting digit-only keys to int.

    Intended as a ``json.loads(..., object_pairs_hook=int_keys)`` hook, which
    passes each JSON object as a list of string-keyed pairs. Keys for which
    ``str.isdigit()`` is true become ``int``; all other keys are kept as-is.
    """
    return {(int(key) if key.isdigit() else key): value for key, value in d}
43
+
44
+
45
class ModelParameters(object):
    """Load a model parameter dict and fill in missing boolean flags.

    After construction ``self.param`` holds the parameters, with the keys
    ``mid_side``, ``mid_side_b``, ``mid_side_b2``, ``stereo_w``, ``stereo_n``
    and ``reverse`` guaranteed to exist (defaulting to ``False``).
    """

    def __init__(self, config_path=""):
        """Read parameters according to the suffix of ``config_path``.

        * ``.pth``  - opened as a zip archive; ``param.json`` inside is parsed.
        * ``.json`` - parsed directly.
        * anything else (including the default ``""``) - a copy of the
          module-level ``default_param`` is used.

        Digit-only JSON object keys are converted to ``int`` via ``int_keys``.
        """
        suffix = pathlib.Path(config_path).suffix

        if suffix == ".pth":
            import zipfile

            with zipfile.ZipFile(config_path, "r") as archive:
                self.param = json.loads(
                    archive.read("param.json"), object_pairs_hook=int_keys
                )
        elif suffix == ".json":
            with open(config_path, "r", encoding="utf-8") as f:
                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
        else:
            import copy

            # Copy instead of aliasing: the flag defaults added below (and any
            # later edits by callers) must not leak into the shared
            # module-level default_param.
            self.param = copy.deepcopy(default_param)

        for k in (
            "mid_side",
            "mid_side_b",
            "mid_side_b2",
            "stereo_w",
            "stereo_n",
            "reverse",
        ):
            self.param.setdefault(k, False)
uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 16000,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 16000,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }
uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 32000,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "kaiser_fast"
14
+ }
15
+ },
16
+ "sr": 32000,
17
+ "pre_filter_start": 1000,
18
+ "pre_filter_stop": 1021
19
+ }
uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 33075,
8
+ "hl": 384,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 33075,
17
+ "pre_filter_start": 1000,
18
+ "pre_filter_stop": 1021
19
+ }
uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 1024,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }
uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 256,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 256,
9
+ "n_fft": 512,
10
+ "crop_start": 0,
11
+ "crop_stop": 256,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 256,
18
+ "pre_filter_stop": 256
19
+ }
uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }
uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 700,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 700
19
+ }
uvr5_pack/lib_v5/modelparams/2band_32000.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 7,
4
+ "reduction_bins": 705,
5
+ "band": {
6
+ "1": {
7
+ "sr": 6000,
8
+ "hl": 66,
9
+ "n_fft": 512,
10
+ "crop_start": 0,
11
+ "crop_stop": 240,
12
+ "lpf_start": 60,
13
+ "lpf_stop": 118,
14
+ "res_type": "sinc_fastest"
15
+ },
16
+ "2": {
17
+ "sr": 32000,
18
+ "hl": 352,
19
+ "n_fft": 1024,
20
+ "crop_start": 22,
21
+ "crop_stop": 505,
22
+ "hpf_start": 44,
23
+ "hpf_stop": 23,
24
+ "res_type": "sinc_medium"
25
+ }
26
+ },
27
+ "sr": 32000,
28
+ "pre_filter_start": 710,
29
+ "pre_filter_stop": 731
30
+ }
uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 512,
3
+ "unstable_bins": 7,
4
+ "reduction_bins": 510,
5
+ "band": {
6
+ "1": {
7
+ "sr": 11025,
8
+ "hl": 160,
9
+ "n_fft": 768,
10
+ "crop_start": 0,
11
+ "crop_stop": 192,
12
+ "lpf_start": 41,
13
+ "lpf_stop": 139,
14
+ "res_type": "sinc_fastest"
15
+ },
16
+ "2": {
17
+ "sr": 44100,
18
+ "hl": 640,
19
+ "n_fft": 1024,
20
+ "crop_start": 10,
21
+ "crop_stop": 320,
22
+ "hpf_start": 47,
23
+ "hpf_stop": 15,
24
+ "res_type": "sinc_medium"
25
+ }
26
+ },
27
+ "sr": 44100,
28
+ "pre_filter_start": 510,
29
+ "pre_filter_stop": 512
30
+ }
uvr5_pack/lib_v5/modelparams/2band_48000.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 7,
4
+ "reduction_bins": 705,
5
+ "band": {
6
+ "1": {
7
+ "sr": 6000,
8
+ "hl": 66,
9
+ "n_fft": 512,
10
+ "crop_start": 0,
11
+ "crop_stop": 240,
12
+ "lpf_start": 60,
13
+ "lpf_stop": 240,
14
+ "res_type": "sinc_fastest"
15
+ },
16
+ "2": {
17
+ "sr": 48000,
18
+ "hl": 528,
19
+ "n_fft": 1536,
20
+ "crop_start": 22,
21
+ "crop_stop": 505,
22
+ "hpf_start": 82,
23
+ "hpf_stop": 22,
24
+ "res_type": "sinc_medium"
25
+ }
26
+ },
27
+ "sr": 48000,
28
+ "pre_filter_start": 710,
29
+ "pre_filter_stop": 731
30
+ }
uvr5_pack/lib_v5/modelparams/3band_44100.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 5,
4
+ "reduction_bins": 733,
5
+ "band": {
6
+ "1": {
7
+ "sr": 11025,
8
+ "hl": 128,
9
+ "n_fft": 768,
10
+ "crop_start": 0,
11
+ "crop_stop": 278,
12
+ "lpf_start": 28,
13
+ "lpf_stop": 140,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 22050,
18
+ "hl": 256,
19
+ "n_fft": 768,
20
+ "crop_start": 14,
21
+ "crop_stop": 322,
22
+ "hpf_start": 70,
23
+ "hpf_stop": 14,
24
+ "lpf_start": 283,
25
+ "lpf_stop": 314,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 44100,
30
+ "hl": 512,
31
+ "n_fft": 768,
32
+ "crop_start": 131,
33
+ "crop_stop": 313,
34
+ "hpf_start": 154,
35
+ "hpf_stop": 141,
36
+ "res_type": "sinc_medium"
37
+ }
38
+ },
39
+ "sr": 44100,
40
+ "pre_filter_start": 757,
41
+ "pre_filter_stop": 768
42
+ }
uvr5_pack/lib_v5/modelparams/3band_44100_mid.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side": true,
3
+ "bins": 768,
4
+ "unstable_bins": 5,
5
+ "reduction_bins": 733,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 768,
11
+ "crop_start": 0,
12
+ "crop_stop": 278,
13
+ "lpf_start": 28,
14
+ "lpf_stop": 140,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 22050,
19
+ "hl": 256,
20
+ "n_fft": 768,
21
+ "crop_start": 14,
22
+ "crop_stop": 322,
23
+ "hpf_start": 70,
24
+ "hpf_stop": 14,
25
+ "lpf_start": 283,
26
+ "lpf_stop": 314,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 44100,
31
+ "hl": 512,
32
+ "n_fft": 768,
33
+ "crop_start": 131,
34
+ "crop_stop": 313,
35
+ "hpf_start": 154,
36
+ "hpf_stop": 141,
37
+ "res_type": "sinc_medium"
38
+ }
39
+ },
40
+ "sr": 44100,
41
+ "pre_filter_start": 757,
42
+ "pre_filter_stop": 768
43
+ }
uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side_b2": true,
3
+ "bins": 640,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 565,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 108,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 187,
13
+ "lpf_start": 92,
14
+ "lpf_stop": 186,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 22050,
19
+ "hl": 216,
20
+ "n_fft": 768,
21
+ "crop_start": 0,
22
+ "crop_stop": 212,
23
+ "hpf_start": 68,
24
+ "hpf_stop": 34,
25
+ "lpf_start": 174,
26
+ "lpf_stop": 209,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 44100,
31
+ "hl": 432,
32
+ "n_fft": 640,
33
+ "crop_start": 66,
34
+ "crop_stop": 307,
35
+ "hpf_start": 86,
36
+ "hpf_stop": 72,
37
+ "res_type": "kaiser_fast"
38
+ }
39
+ },
40
+ "sr": 44100,
41
+ "pre_filter_start": 639,
42
+ "pre_filter_stop": 640
43
+ }
uvr5_pack/lib_v5/modelparams/4band_44100.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 7,
4
+ "reduction_bins": 668,
5
+ "band": {
6
+ "1": {
7
+ "sr": 11025,
8
+ "hl": 128,
9
+ "n_fft": 1024,
10
+ "crop_start": 0,
11
+ "crop_stop": 186,
12
+ "lpf_start": 37,
13
+ "lpf_stop": 73,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 11025,
18
+ "hl": 128,
19
+ "n_fft": 512,
20
+ "crop_start": 4,
21
+ "crop_stop": 185,
22
+ "hpf_start": 36,
23
+ "hpf_stop": 18,
24
+ "lpf_start": 93,
25
+ "lpf_stop": 185,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 22050,
30
+ "hl": 256,
31
+ "n_fft": 512,
32
+ "crop_start": 46,
33
+ "crop_stop": 186,
34
+ "hpf_start": 93,
35
+ "hpf_stop": 46,
36
+ "lpf_start": 164,
37
+ "lpf_stop": 186,
38
+ "res_type": "polyphase"
39
+ },
40
+ "4": {
41
+ "sr": 44100,
42
+ "hl": 512,
43
+ "n_fft": 768,
44
+ "crop_start": 121,
45
+ "crop_stop": 382,
46
+ "hpf_start": 138,
47
+ "hpf_stop": 123,
48
+ "res_type": "sinc_medium"
49
+ }
50
+ },
51
+ "sr": 44100,
52
+ "pre_filter_start": 740,
53
+ "pre_filter_stop": 768
54
+ }
uvr5_pack/lib_v5/modelparams/4band_44100_mid.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 7,
4
+ "mid_side": true,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
uvr5_pack/lib_v5/modelparams/4band_44100_msb.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side_b": true,
3
+ "bins": 768,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side_b": true,
3
+ "bins": 768,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "reverse": true,
3
+ "bins": 768,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
uvr5_pack/lib_v5/modelparams/4band_44100_sw.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stereo_w": true,
3
+ "bins": 768,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
uvr5_pack/lib_v5/modelparams/4band_v2.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 672,
3
+ "unstable_bins": 8,
4
+ "reduction_bins": 637,
5
+ "band": {
6
+ "1": {
7
+ "sr": 7350,
8
+ "hl": 80,
9
+ "n_fft": 640,
10
+ "crop_start": 0,
11
+ "crop_stop": 85,
12
+ "lpf_start": 25,
13
+ "lpf_stop": 53,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 7350,
18
+ "hl": 80,
19
+ "n_fft": 320,
20
+ "crop_start": 4,
21
+ "crop_stop": 87,
22
+ "hpf_start": 25,
23
+ "hpf_stop": 12,
24
+ "lpf_start": 31,
25
+ "lpf_stop": 62,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 14700,
30
+ "hl": 160,
31
+ "n_fft": 512,
32
+ "crop_start": 17,
33
+ "crop_stop": 216,
34
+ "hpf_start": 48,
35
+ "hpf_stop": 24,
36
+ "lpf_start": 139,
37
+ "lpf_stop": 210,
38
+ "res_type": "polyphase"
39
+ },
40
+ "4": {
41
+ "sr": 44100,
42
+ "hl": 480,
43
+ "n_fft": 960,
44
+ "crop_start": 78,
45
+ "crop_stop": 383,
46
+ "hpf_start": 130,
47
+ "hpf_stop": 86,
48
+ "res_type": "kaiser_fast"
49
+ }
50
+ },
51
+ "sr": 44100,
52
+ "pre_filter_start": 668,
53
+ "pre_filter_stop": 672
54
+ }
uvr5_pack/lib_v5/modelparams/4band_v2_sn.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 672,
3
+ "unstable_bins": 8,
4
+ "reduction_bins": 637,
5
+ "band": {
6
+ "1": {
7
+ "sr": 7350,
8
+ "hl": 80,
9
+ "n_fft": 640,
10
+ "crop_start": 0,
11
+ "crop_stop": 85,
12
+ "lpf_start": 25,
13
+ "lpf_stop": 53,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 7350,
18
+ "hl": 80,
19
+ "n_fft": 320,
20
+ "crop_start": 4,
21
+ "crop_stop": 87,
22
+ "hpf_start": 25,
23
+ "hpf_stop": 12,
24
+ "lpf_start": 31,
25
+ "lpf_stop": 62,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 14700,
30
+ "hl": 160,
31
+ "n_fft": 512,
32
+ "crop_start": 17,
33
+ "crop_stop": 216,
34
+ "hpf_start": 48,
35
+ "hpf_stop": 24,
36
+ "lpf_start": 139,
37
+ "lpf_stop": 210,
38
+ "res_type": "polyphase"
39
+ },
40
+ "4": {
41
+ "sr": 44100,
42
+ "hl": 480,
43
+ "n_fft": 960,
44
+ "crop_start": 78,
45
+ "crop_stop": 383,
46
+ "hpf_start": 130,
47
+ "hpf_stop": 86,
48
+ "convert_channels": "stereo_n",
49
+ "res_type": "kaiser_fast"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 668,
54
+ "pre_filter_stop": 672
55
+ }
uvr5_pack/lib_v5/modelparams/4band_v3.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 672,
3
+ "unstable_bins": 8,
4
+ "reduction_bins": 530,
5
+ "band": {
6
+ "1": {
7
+ "sr": 7350,
8
+ "hl": 80,
9
+ "n_fft": 640,
10
+ "crop_start": 0,
11
+ "crop_stop": 85,
12
+ "lpf_start": 25,
13
+ "lpf_stop": 53,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 7350,
18
+ "hl": 80,
19
+ "n_fft": 320,
20
+ "crop_start": 4,
21
+ "crop_stop": 87,
22
+ "hpf_start": 25,
23
+ "hpf_stop": 12,
24
+ "lpf_start": 31,
25
+ "lpf_stop": 62,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 14700,
30
+ "hl": 160,
31
+ "n_fft": 512,
32
+ "crop_start": 17,
33
+ "crop_stop": 216,
34
+ "hpf_start": 48,
35
+ "hpf_stop": 24,
36
+ "lpf_start": 139,
37
+ "lpf_stop": 210,
38
+ "res_type": "polyphase"
39
+ },
40
+ "4": {
41
+ "sr": 44100,
42
+ "hl": 480,
43
+ "n_fft": 960,
44
+ "crop_start": 78,
45
+ "crop_stop": 383,
46
+ "hpf_start": 130,
47
+ "hpf_stop": 86,
48
+ "res_type": "kaiser_fast"
49
+ }
50
+ },
51
+ "sr": 44100,
52
+ "pre_filter_start": 668,
53
+ "pre_filter_stop": 672
54
+ }
uvr5_pack/lib_v5/modelparams/ensemble.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side_b2": true,
3
+ "bins": 1280,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 565,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 108,
10
+ "n_fft": 2048,
11
+ "crop_start": 0,
12
+ "crop_stop": 374,
13
+ "lpf_start": 92,
14
+ "lpf_stop": 186,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 22050,
19
+ "hl": 216,
20
+ "n_fft": 1536,
21
+ "crop_start": 0,
22
+ "crop_stop": 424,
23
+ "hpf_start": 68,
24
+ "hpf_stop": 34,
25
+ "lpf_start": 348,
26
+ "lpf_stop": 418,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 44100,
31
+ "hl": 432,
32
+ "n_fft": 1280,
33
+ "crop_start": 132,
34
+ "crop_stop": 614,
35
+ "hpf_start": 172,
36
+ "hpf_stop": 144,
37
+ "res_type": "polyphase"
38
+ }
39
+ },
40
+ "sr": 44100,
41
+ "pre_filter_start": 1280,
42
+ "pre_filter_stop": 1280
43
+ }
uvr5_pack/lib_v5/nets.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import layers
6
+ from uvr5_pack.lib_v5 import spec_utils
7
+
8
+
9
class BaseASPPNet(nn.Module):
    """Four encoders, an ASPP block, and four decoders with skip connections.

    Channel widths double at each encoder stage (``ch`` .. ``ch * 8``), the
    ASPP block outputs ``ch * 16``, and each decoder consumes the previous
    stage's output concatenated with the matching encoder skip.

    Args:
        nin: number of input channels.
        ch: base channel width.
        dilations: dilation rates forwarded to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # ``forward`` instead of an overridden ``__call__`` so nn.Module's call
    # machinery (hooks) is not bypassed; ``net(x)`` behaves as before.
    def forward(self, x):
        """Encode ``x``, apply ASPP, and decode using the encoder skips."""
        # Each encoder call is unpacked as (features, skip).
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        # Decoders consume the skips in reverse order.
        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
38
+
39
+
40
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models producing a mask.

    Stage 1 runs separate nets on the lower and upper halves of axis 2 and
    joins them; stage 2 and stage 3 each see the input concatenated with all
    earlier stage outputs, reduced by a 1x1 bridge conv. The final sigmoid
    mask is multiplied with the (detached) input.

    Args:
        n_fft: sets ``max_bin = n_fft // 2`` (input is cut to this many
            entries on axis 2) and ``output_bin = n_fft // 2 + 1`` (masks are
            padded back up to this size).
    """

    def __init__(self, n_fft):
        super().__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)
        self.stg1_high_band_net = BaseASPPNet(2, 16)

        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(8, 16)

        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(16, 32)

        self.out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)

        half = n_fft // 2
        self.max_bin = half
        self.output_bin = half + 1

        # Number of entries predict() trims from each end of the last axis.
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        """Return the masked input.

        In training mode returns ``(mask * mix, aux1 * mix, aux2 * mix)``.
        Otherwise returns ``mask * mix``; if ``aggressiveness`` is truthy it
        must provide ``"split_bin"`` and ``"value"``, and the mask is raised
        to ``1 + value / 3`` below ``split_bin`` and ``1 + value`` from
        ``split_bin`` upward (along axis 2).
        """

        def pad_bins(t):
            # Extend axis 2 up to output_bin by repeating the edge values.
            return F.pad(
                input=t,
                pad=(0, 0, 0, self.output_bin - t.size()[2]),
                mode="replicate",
            )

        mix = x.detach()
        x = x.clone()[:, :, : self.max_bin]

        # Stage 1: independent nets for the two halves of axis 2.
        bandw = x.size()[2] // 2
        low = self.stg1_low_band_net(x[:, :, :bandw])
        high = self.stg1_high_band_net(x[:, :, bandw:])
        aux1 = torch.cat([low, high], dim=2)

        # Stage 2: input + stage-1 output.
        stage2_in = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        # Stage 3: input + both earlier outputs.
        stage3_in = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = pad_bins(torch.sigmoid(self.out(h)))

        if self.training:
            aux1 = pad_bins(torch.sigmoid(self.aux1_out(aux1)))
            aux2 = pad_bins(torch.sigmoid(self.aux2_out(aux2)))
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            value = aggressiveness["value"]
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + value / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + value)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``self.offset`` entries from both ends of axis 3.

        Expects the single-tensor (non-training) result of ``forward``.
        """
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
uvr5_pack/lib_v5/nets_123812KB.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import layers_123821KB as layers
6
+
7
+
8
class BaseASPPNet(nn.Module):
    """Four encoders, an ASPP block, and four decoders with skip connections.

    Channel widths double at each encoder stage (``ch`` .. ``ch * 8``), the
    ASPP block outputs ``ch * 16``, and each decoder consumes the previous
    stage's output concatenated with the matching encoder skip.

    Args:
        nin: number of input channels.
        ch: base channel width.
        dilations: dilation rates forwarded to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # ``forward`` instead of an overridden ``__call__`` so nn.Module's call
    # machinery (hooks) is not bypassed; ``net(x)`` behaves as before.
    def forward(self, x):
        """Encode ``x``, apply ASPP, and decode using the encoder skips."""
        # Each encoder call is unpacked as (features, skip).
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        # Decoders consume the skips in reverse order.
        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
+
38
+
39
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models producing a mask.

    Stage 1 runs separate nets on the lower and upper halves of axis 2 and
    joins them; stage 2 and stage 3 each see the input concatenated with all
    earlier stage outputs, reduced by a 1x1 bridge conv. The final sigmoid
    mask is multiplied with the (detached) input.

    Args:
        n_fft: sets ``max_bin = n_fft // 2`` (input is cut to this many
            entries on axis 2) and ``output_bin = n_fft // 2 + 1`` (masks are
            padded back up to this size).
    """

    def __init__(self, n_fft):
        super().__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        half = n_fft // 2
        self.max_bin = half
        self.output_bin = half + 1

        # Number of entries predict() trims from each end of the last axis.
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        """Return the masked input.

        In training mode returns ``(mask * mix, aux1 * mix, aux2 * mix)``.
        Otherwise returns ``mask * mix``; if ``aggressiveness`` is truthy it
        must provide ``"split_bin"`` and ``"value"``, and the mask is raised
        to ``1 + value / 3`` below ``split_bin`` and ``1 + value`` from
        ``split_bin`` upward (along axis 2).
        """

        def pad_bins(t):
            # Extend axis 2 up to output_bin by repeating the edge values.
            return F.pad(
                input=t,
                pad=(0, 0, 0, self.output_bin - t.size()[2]),
                mode="replicate",
            )

        mix = x.detach()
        x = x.clone()[:, :, : self.max_bin]

        # Stage 1: independent nets for the two halves of axis 2.
        bandw = x.size()[2] // 2
        low = self.stg1_low_band_net(x[:, :, :bandw])
        high = self.stg1_high_band_net(x[:, :, bandw:])
        aux1 = torch.cat([low, high], dim=2)

        # Stage 2: input + stage-1 output.
        stage2_in = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        # Stage 3: input + both earlier outputs.
        stage3_in = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = pad_bins(torch.sigmoid(self.out(h)))

        if self.training:
            aux1 = pad_bins(torch.sigmoid(self.aux1_out(aux1)))
            aux2 = pad_bins(torch.sigmoid(self.aux2_out(aux2)))
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            value = aggressiveness["value"]
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + value / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + value)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``self.offset`` entries from both ends of axis 3.

        Expects the single-tensor (non-training) result of ``forward``.
        """
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
uvr5_pack/lib_v5/nets_123821KB.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import layers_123821KB as layers
6
+
7
+
8
class BaseASPPNet(nn.Module):
    """Encoder / ASPP / decoder network with skip connections.

    Four ``layers.Encoder`` stages feed a ``layers.ASPPModule``; four
    ``layers.Decoder`` stages then consume the running tensor together with
    the second value returned by the matching encoder.

    Args:
        nin: Number of input channels given to the first encoder.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation settings forwarded to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Run the encoder, ASPP and decoder stages on ``x``.

        This is defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` dispatches here and registered hooks keep
        working; ``net(x)`` behaves as before for callers.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        # Decoders are applied deepest-first, each with its encoder's skip.
        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
+
38
+
39
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` blocks producing a sigmoid mask.

    Stage 1 runs separate networks on the two halves of dim 2 of the input;
    stages 2 and 3 run on the whole band after a bridge layer. ``forward``
    returns the mask multiplied by the detached input.

    Args:
        n_fft: ``n_fft // 2`` entries of dim 2 are fed to the networks and the
            mask is padded back to ``n_fft // 2 + 1`` entries.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        half_fft = n_fft // 2

        # Stage 1: one network per half of dim 2.
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        # Stage 2: bridge, then a full-band network.
        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        # Stage 3: bridge, then a wider full-band network.
        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        # Output heads (kernel size 1, no bias).
        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        self.max_bin = half_fft
        self.output_bin = half_fft + 1

        self.offset = 128

    def _pad_to_output_bin(self, t):
        """Replicate-pad the end of dim 2 of ``t`` up to ``self.output_bin``."""
        missing = self.output_bin - t.size()[2]
        return F.pad(input=t, pad=(0, 0, 0, missing), mode="replicate")

    def forward(self, x, aggressiveness=None):
        """Return the predicted mask applied to the detached input.

        In training mode a 3-tuple ``(mask * mix, aux1 * mix, aux2 * mix)`` is
        returned. Otherwise a single tensor is returned and, when
        ``aggressiveness`` is truthy, the mask is first raised to
        ``1 + value / 3`` below ``split_bin`` and ``1 + value`` above it.
        """
        mix = x.detach()
        x = x.clone()[:, :, : self.max_bin]

        half = x.size()[2] // 2
        low = self.stg1_low_band_net(x[:, :, :half])
        high = self.stg1_high_band_net(x[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = self._pad_to_output_bin(torch.sigmoid(self.out(h)))

        if self.training:
            aux1 = self._pad_to_output_bin(torch.sigmoid(self.aux1_out(aux1)))
            aux2 = self._pad_to_output_bin(torch.sigmoid(self.aux2_out(aux2)))
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            mask[:, :, :split] = torch.pow(
                mask[:, :, :split], 1 + aggressiveness["value"] / 3
            )
            mask[:, :, split:] = torch.pow(
                mask[:, :, split:], 1 + aggressiveness["value"]
            )

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``self.offset`` entries off both ends of dim 3."""
        pred = self.forward(x_mag, aggressiveness)

        if self.offset <= 0:
            return pred

        pred = pred[:, :, :, self.offset : -self.offset]
        assert pred.size()[3] > 0
        return pred
uvr5_pack/lib_v5/nets_33966KB.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import layers_33966KB as layers
6
+
7
+
8
class BaseASPPNet(nn.Module):
    """Encoder / ASPP / decoder network with skip connections.

    Four ``layers.Encoder`` stages feed a ``layers.ASPPModule``; four
    ``layers.Decoder`` stages then consume the running tensor together with
    the second value returned by the matching encoder.

    Args:
        nin: Number of input channels given to the first encoder.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation settings forwarded to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
        super().__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Run the encoder, ASPP and decoder stages on ``x``.

        This is defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` dispatches here and registered hooks keep
        working; ``net(x)`` behaves as before for callers.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        # Decoders are applied deepest-first, each with its encoder's skip.
        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
+
38
+
39
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` blocks producing a sigmoid mask.

    Stage 1 runs separate networks on the two halves of dim 2 of the input;
    stages 2 and 3 run on the whole band after a bridge layer. ``forward``
    returns the mask multiplied by the detached input.

    Args:
        n_fft: ``n_fft // 2`` entries of dim 2 are fed to the networks and the
            mask is padded back to ``n_fft // 2 + 1`` entries.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        half_fft = n_fft // 2

        # Stage 1: one network per half of dim 2.
        self.stg1_low_band_net = BaseASPPNet(2, 16)
        self.stg1_high_band_net = BaseASPPNet(2, 16)

        # Stage 2: bridge, then a full-band network.
        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(8, 16)

        # Stage 3: bridge, then a wider full-band network.
        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(16, 32)

        # Output heads (kernel size 1, no bias).
        self.out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)

        self.max_bin = half_fft
        self.output_bin = half_fft + 1

        self.offset = 128

    def _pad_to_output_bin(self, t):
        """Replicate-pad the end of dim 2 of ``t`` up to ``self.output_bin``."""
        missing = self.output_bin - t.size()[2]
        return F.pad(input=t, pad=(0, 0, 0, missing), mode="replicate")

    def forward(self, x, aggressiveness=None):
        """Return the predicted mask applied to the detached input.

        In training mode a 3-tuple ``(mask * mix, aux1 * mix, aux2 * mix)`` is
        returned. Otherwise a single tensor is returned and, when
        ``aggressiveness`` is truthy, the mask is first raised to
        ``1 + value / 3`` below ``split_bin`` and ``1 + value`` above it.
        """
        mix = x.detach()
        x = x.clone()[:, :, : self.max_bin]

        half = x.size()[2] // 2
        low = self.stg1_low_band_net(x[:, :, :half])
        high = self.stg1_high_band_net(x[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = self._pad_to_output_bin(torch.sigmoid(self.out(h)))

        if self.training:
            aux1 = self._pad_to_output_bin(torch.sigmoid(self.aux1_out(aux1)))
            aux2 = self._pad_to_output_bin(torch.sigmoid(self.aux2_out(aux2)))
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            mask[:, :, :split] = torch.pow(
                mask[:, :, :split], 1 + aggressiveness["value"] / 3
            )
            mask[:, :, split:] = torch.pow(
                mask[:, :, split:], 1 + aggressiveness["value"]
            )

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``self.offset`` entries off both ends of dim 3."""
        pred = self.forward(x_mag, aggressiveness)

        if self.offset <= 0:
            return pred

        pred = pred[:, :, :, self.offset : -self.offset]
        assert pred.size()[3] > 0
        return pred
uvr5_pack/lib_v5/nets_537227KB.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+
6
+ from uvr5_pack.lib_v5 import layers_537238KB as layers
7
+
8
+
9
class BaseASPPNet(nn.Module):
    """Encoder / ASPP / decoder network with skip connections.

    Four ``layers.Encoder`` stages feed a ``layers.ASPPModule``; four
    ``layers.Decoder`` stages then consume the running tensor together with
    the second value returned by the matching encoder.

    Args:
        nin: Number of input channels given to the first encoder.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation settings forwarded to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Run the encoder, ASPP and decoder stages on ``x``.

        This is defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` dispatches here and registered hooks keep
        working; ``net(x)`` behaves as before for callers.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        # Decoders are applied deepest-first, each with its encoder's skip.
        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
38
+
39
+
40
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` blocks producing a sigmoid mask.

    Stage 1 runs separate networks on the two halves of dim 2 of the input;
    stages 2 and 3 run on the whole band after a bridge layer. ``forward``
    returns the mask multiplied by the detached input.

    Args:
        n_fft: ``n_fft // 2`` entries of dim 2 are fed to the networks and the
            mask is padded back to ``n_fft // 2 + 1`` entries.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        half_fft = n_fft // 2

        # Stage 1: one network per half of dim 2.
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        # Stage 2: bridge, then a full-band network.
        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        # Stage 3: bridge, then a wider full-band network.
        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        # Output heads (kernel size 1, no bias).
        self.out = nn.Conv2d(128, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

        self.max_bin = half_fft
        self.output_bin = half_fft + 1

        self.offset = 128

    def _pad_to_output_bin(self, t):
        """Replicate-pad the end of dim 2 of ``t`` up to ``self.output_bin``."""
        missing = self.output_bin - t.size()[2]
        return F.pad(input=t, pad=(0, 0, 0, missing), mode="replicate")

    def forward(self, x, aggressiveness=None):
        """Return the predicted mask applied to the detached input.

        In training mode a 3-tuple ``(mask * mix, aux1 * mix, aux2 * mix)`` is
        returned. Otherwise a single tensor is returned and, when
        ``aggressiveness`` is truthy, the mask is first raised to
        ``1 + value / 3`` below ``split_bin`` and ``1 + value`` above it.
        """
        mix = x.detach()
        x = x.clone()[:, :, : self.max_bin]

        half = x.size()[2] // 2
        low = self.stg1_low_band_net(x[:, :, :half])
        high = self.stg1_high_band_net(x[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = self._pad_to_output_bin(torch.sigmoid(self.out(h)))

        if self.training:
            aux1 = self._pad_to_output_bin(torch.sigmoid(self.aux1_out(aux1)))
            aux2 = self._pad_to_output_bin(torch.sigmoid(self.aux2_out(aux2)))
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            mask[:, :, :split] = torch.pow(
                mask[:, :, :split], 1 + aggressiveness["value"] / 3
            )
            mask[:, :, split:] = torch.pow(
                mask[:, :, split:], 1 + aggressiveness["value"]
            )

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``self.offset`` entries off both ends of dim 3."""
        pred = self.forward(x_mag, aggressiveness)

        if self.offset <= 0:
            return pred

        pred = pred[:, :, :, self.offset : -self.offset]
        assert pred.size()[3] > 0
        return pred
uvr5_pack/lib_v5/nets_537238KB.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+
6
+ from uvr5_pack.lib_v5 import layers_537238KB as layers
7
+
8
+
9
class BaseASPPNet(nn.Module):
    """Encoder / ASPP / decoder network with skip connections.

    Four ``layers.Encoder`` stages feed a ``layers.ASPPModule``; four
    ``layers.Decoder`` stages then consume the running tensor together with
    the second value returned by the matching encoder.

    Args:
        nin: Number of input channels given to the first encoder.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation settings forwarded to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Run the encoder, ASPP and decoder stages on ``x``.

        This is defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` dispatches here and registered hooks keep
        working; ``net(x)`` behaves as before for callers.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        # Decoders are applied deepest-first, each with its encoder's skip.
        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
38
+
39
+
40
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` blocks producing a sigmoid mask.

    Stage 1 runs separate networks on the two halves of dim 2 of the input;
    stages 2 and 3 run on the whole band after a bridge layer. ``forward``
    returns the mask multiplied by the detached input.

    Args:
        n_fft: ``n_fft // 2`` entries of dim 2 are fed to the networks and the
            mask is padded back to ``n_fft // 2 + 1`` entries.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        half_fft = n_fft // 2

        # Stage 1: one network per half of dim 2.
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        # Stage 2: bridge, then a full-band network.
        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        # Stage 3: bridge, then a wider full-band network.
        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        # Output heads (kernel size 1, no bias).
        self.out = nn.Conv2d(128, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

        self.max_bin = half_fft
        self.output_bin = half_fft + 1

        self.offset = 128

    def _pad_to_output_bin(self, t):
        """Replicate-pad the end of dim 2 of ``t`` up to ``self.output_bin``."""
        missing = self.output_bin - t.size()[2]
        return F.pad(input=t, pad=(0, 0, 0, missing), mode="replicate")

    def forward(self, x, aggressiveness=None):
        """Return the predicted mask applied to the detached input.

        In training mode a 3-tuple ``(mask * mix, aux1 * mix, aux2 * mix)`` is
        returned. Otherwise a single tensor is returned and, when
        ``aggressiveness`` is truthy, the mask is first raised to
        ``1 + value / 3`` below ``split_bin`` and ``1 + value`` above it.
        """
        mix = x.detach()
        x = x.clone()[:, :, : self.max_bin]

        half = x.size()[2] // 2
        low = self.stg1_low_band_net(x[:, :, :half])
        high = self.stg1_high_band_net(x[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = self._pad_to_output_bin(torch.sigmoid(self.out(h)))

        if self.training:
            aux1 = self._pad_to_output_bin(torch.sigmoid(self.aux1_out(aux1)))
            aux2 = self._pad_to_output_bin(torch.sigmoid(self.aux2_out(aux2)))
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            mask[:, :, :split] = torch.pow(
                mask[:, :, :split], 1 + aggressiveness["value"] / 3
            )
            mask[:, :, split:] = torch.pow(
                mask[:, :, split:], 1 + aggressiveness["value"]
            )

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``self.offset`` entries off both ends of dim 3."""
        pred = self.forward(x_mag, aggressiveness)

        if self.offset <= 0:
            return pred

        pred = pred[:, :, :, self.offset : -self.offset]
        assert pred.size()[3] > 0
        return pred
uvr5_pack/lib_v5/nets_61968KB.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from uvr5_pack.lib_v5 import layers_123821KB as layers
6
+
7
+
8
class BaseASPPNet(nn.Module):
    """Encoder / ASPP / decoder network with skip connections.

    Four ``layers.Encoder`` stages feed a ``layers.ASPPModule``; four
    ``layers.Decoder`` stages then consume the running tensor together with
    the second value returned by the matching encoder.

    Args:
        nin: Number of input channels given to the first encoder.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation settings forwarded to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Run the encoder, ASPP and decoder stages on ``x``.

        This is defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` dispatches here and registered hooks keep
        working; ``net(x)`` behaves as before for callers.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        # Decoders are applied deepest-first, each with its encoder's skip.
        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
+
38
+
39
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` blocks producing a sigmoid mask.

    Stage 1 runs separate networks on the two halves of dim 2 of the input;
    stages 2 and 3 run on the whole band after a bridge layer. ``forward``
    returns the mask multiplied by the detached input.

    Args:
        n_fft: ``n_fft // 2`` entries of dim 2 are fed to the networks and the
            mask is padded back to ``n_fft // 2 + 1`` entries.
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        half_fft = n_fft // 2

        # Stage 1: one network per half of dim 2.
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        # Stage 2: bridge, then a full-band network.
        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        # Stage 3: bridge, then a wider full-band network.
        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        # Output heads (kernel size 1, no bias).
        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        self.max_bin = half_fft
        self.output_bin = half_fft + 1

        self.offset = 128

    def _pad_to_output_bin(self, t):
        """Replicate-pad the end of dim 2 of ``t`` up to ``self.output_bin``."""
        missing = self.output_bin - t.size()[2]
        return F.pad(input=t, pad=(0, 0, 0, missing), mode="replicate")

    def forward(self, x, aggressiveness=None):
        """Return the predicted mask applied to the detached input.

        In training mode a 3-tuple ``(mask * mix, aux1 * mix, aux2 * mix)`` is
        returned. Otherwise a single tensor is returned and, when
        ``aggressiveness`` is truthy, the mask is first raised to
        ``1 + value / 3`` below ``split_bin`` and ``1 + value`` above it.
        """
        mix = x.detach()
        x = x.clone()[:, :, : self.max_bin]

        half = x.size()[2] // 2
        low = self.stg1_low_band_net(x[:, :, :half])
        high = self.stg1_high_band_net(x[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = self._pad_to_output_bin(torch.sigmoid(self.out(h)))

        if self.training:
            aux1 = self._pad_to_output_bin(torch.sigmoid(self.aux1_out(aux1)))
            aux2 = self._pad_to_output_bin(torch.sigmoid(self.aux2_out(aux2)))
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            mask[:, :, :split] = torch.pow(
                mask[:, :, :split], 1 + aggressiveness["value"] / 3
            )
            mask[:, :, split:] = torch.pow(
                mask[:, :, split:], 1 + aggressiveness["value"]
            )

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and trim ``self.offset`` entries off both ends of dim 3."""
        pred = self.forward(x_mag, aggressiveness)

        if self.offset <= 0:
            return pred

        pred = pred[:, :, :, self.offset : -self.offset]
        assert pred.size()[3] > 0
        return pred
uvr5_pack/lib_v5/nets_new.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ from uvr5_pack.lib_v5 import layers_new as layers
5
+
6
+
7
class BaseNet(nn.Module):
    """Encoder / ASPP / decoder network with an LSTM branch before the last decoder.

    A ``layers.Conv2DBNActiv`` followed by four ``layers.Encoder`` stages
    feeds a ``layers.ASPPModule``. Three decoders consume the matching
    encoder outputs, the ``layers.LSTMModule`` output is concatenated onto
    the result along dim 1, and a final decoder combines it with the first
    stage's output.

    Args:
        nin: Number of input channels.
        nout: Base channel width; deeper stages use multiples of it.
        nin_lstm: Forwarded to ``layers.LSTMModule`` as its second argument.
        nout_lstm: Forwarded to ``layers.LSTMModule`` as its third argument.
        dilations: Dilation settings forwarded to ``layers.ASPPModule``.
    """

    def __init__(
        self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
    ):
        super().__init__()
        self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1)
        self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1)
        self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)

        self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
        self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
        self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm)
        # The "+ 1" accounts for the LSTM branch concatenated in forward
        # (presumably a single channel -- confirm against layers.LSTMModule).
        self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)

    def forward(self, x):
        """Run the encoder, ASPP, decoder and LSTM stages on ``x``.

        This is defined as ``forward`` rather than ``__call__`` so that
        ``nn.Module.__call__`` dispatches here and registered hooks keep
        working; ``net(x)`` and use inside ``nn.Sequential`` behave as before.
        """
        e1 = self.enc1(x)
        e2 = self.enc2(e1)
        e3 = self.enc3(e2)
        e4 = self.enc4(e3)
        e5 = self.enc5(e4)

        h = self.aspp(e5)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        # Append the LSTM branch as extra channels before the last decoder.
        h = torch.cat([h, self.lstm_dec2(h)], dim=1)
        h = self.dec1(h, e1)

        return h
42
+
43
+
44
class CascadedNet(nn.Module):
    """Three-stage cascade of ``BaseNet`` blocks that predicts a sigmoid mask.

    Stages 1 and 2 process the two halves of dim 2 with separate networks;
    stage 3 processes the whole band together with both earlier outputs.

    Args:
        n_fft: ``n_fft // 2`` entries of dim 2 are fed to the networks and the
            mask is padded back to ``n_fft // 2 + 1`` entries.
        nout: Base channel width of the stage networks.
        nout_lstm: LSTM size argument forwarded to the ``BaseNet`` blocks.
    """

    def __init__(self, n_fft, nout=32, nout_lstm=128):
        super(CascadedNet, self).__init__()

        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1
        self.nin_lstm = self.max_bin // 2
        self.offset = 64

        band_lstm = self.nin_lstm // 2  # LSTM input size for the half-band nets
        stage2_in = nout // 4 + 2  # channel count given to both stage-2 nets

        self.stg1_low_band_net = nn.Sequential(
            BaseNet(2, nout // 2, band_lstm, nout_lstm),
            layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
        )
        self.stg1_high_band_net = BaseNet(2, nout // 4, band_lstm, nout_lstm // 2)

        self.stg2_low_band_net = nn.Sequential(
            BaseNet(stage2_in, nout, band_lstm, nout_lstm),
            layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
        )
        self.stg2_high_band_net = BaseNet(
            stage2_in, nout // 2, band_lstm, nout_lstm // 2
        )

        self.stg3_full_band_net = BaseNet(
            3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
        )

        self.out = nn.Conv2d(nout, 2, 1, bias=False)
        self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)

    def _pad_to_output_bin(self, t):
        """Replicate-pad the end of dim 2 of ``t`` up to ``self.output_bin``."""
        missing = self.output_bin - t.size()[2]
        return F.pad(input=t, pad=(0, 0, 0, missing), mode="replicate")

    def _crop_offset(self, t):
        """Trim ``self.offset`` entries off both ends of dim 3 when it is positive."""
        if self.offset > 0:
            t = t[:, :, :, self.offset : -self.offset]
            assert t.size()[3] > 0
        return t

    def forward(self, x):
        """Return the padded mask, plus the auxiliary mask in training mode."""
        x = x[:, :, : self.max_bin]

        half = x.size()[2] // 2
        low_in = x[:, :, :half]
        high_in = x[:, :, half:]

        # Stage 1: each half on its own.
        low1 = self.stg1_low_band_net(low_in)
        high1 = self.stg1_high_band_net(high_in)
        aux1 = torch.cat([low1, high1], dim=2)

        # Stage 2: each half again, now with its stage-1 output as extra channels.
        low2 = self.stg2_low_band_net(torch.cat([low_in, low1], dim=1))
        high2 = self.stg2_high_band_net(torch.cat([high_in, high1], dim=1))
        aux2 = torch.cat([low2, high2], dim=2)

        # Stage 3: full band with both earlier outputs.
        full = self.stg3_full_band_net(torch.cat([x, aux1, aux2], dim=1))

        mask = self._pad_to_output_bin(torch.sigmoid(self.out(full)))

        if not self.training:
            return mask

        aux = torch.sigmoid(self.aux_out(torch.cat([aux1, aux2], dim=1)))
        return mask, self._pad_to_output_bin(aux)

    def predict_mask(self, x):
        """Return the ``forward`` mask cropped by ``self.offset`` along dim 3."""
        return self._crop_offset(self.forward(x))

    def predict(self, x, aggressiveness=None):
        """Return ``x`` multiplied by the mask, cropped by ``self.offset``.

        ``aggressiveness`` is accepted but not used by this implementation.
        """
        mask = self.forward(x)
        return self._crop_offset(x * mask)
uvr5_pack/lib_v5/spec_utils.py ADDED
@@ -0,0 +1,667 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, librosa
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from tqdm import tqdm
5
+ import json, math, hashlib
6
+
7
+
8
def crop_center(h1, h2):
    """Center-crop ``h1`` along dim 3 to the dim-3 length of ``h2``.

    Args:
        h1: Tensor to crop; must be at least as long as ``h2`` along dim 3.
        h2: Tensor whose dim-3 length is the target.

    Returns:
        ``h1`` itself when the lengths already match, otherwise the centered
        slice of ``h1``.

    Raises:
        ValueError: If ``h1`` is shorter than ``h2`` along dim 3.
    """
    src_len = h1.size()[3]
    dst_len = h2.size()[3]

    if src_len == dst_len:
        return h1
    if src_len < dst_len:
        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")

    # Drop the same amount (rounded down at the front) from both ends.
    start = (src_len - dst_len) // 2
    return h1[:, :, :, start : start + dst_len]
24
+
25
+
26
def wave_to_spectrogram(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    """Compute the STFT of both rows of a two-row waveform.

    The rows ``wave[0]`` and ``wave[1]`` are first transformed according to
    the first truthy flag, checked in this order:

    * ``reverse``: both rows are flipped.
    * ``mid_side``: ``(w0 + w1) / 2`` and ``w0 - w1``.
    * ``mid_side_b2``: ``w1 + 0.5 * w0`` and ``w0 - 0.5 * w1``.

    With no flag set the rows are used unchanged.

    Args:
        wave: Indexable holding the two channel arrays.
        hop_length: Hop length passed to ``librosa.stft``.
        n_fft: FFT size passed to ``librosa.stft``.
        mid_side: Select the mid/side transform.
        mid_side_b2: Select the alternative mid/side transform.
        reverse: Select the flipped transform.

    Returns:
        Fortran-ordered array stacking the two STFT results.
    """
    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # n_fft is passed by keyword: recent librosa releases accept only ``y``
    # positionally, so a positional n_fft raises TypeError there. The keyword
    # form works on older releases as well.
    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

    return np.asfortranarray([spec_left, spec_right])
48
+
49
+
50
def wave_to_spectrogram_mt(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    """Compute the STFT of both rows of a two-row waveform using a helper thread.

    The first row's STFT runs in a background thread while the second row's
    STFT runs in the calling thread. The rows are pre-processed according to
    the first truthy flag, checked in this order:

    * ``reverse``: both rows are flipped.
    * ``mid_side``: ``(w0 + w1) / 2`` and ``w0 - w1``.
    * ``mid_side_b2``: ``w1 + 0.5 * w0`` and ``w0 - 0.5 * w1``.

    Args:
        wave: Indexable holding the two channel arrays.
        hop_length: Hop length passed to ``librosa.stft``.
        n_fft: FFT size passed to ``librosa.stft``.
        mid_side: Select the mid/side transform.
        mid_side_b2: Select the alternative mid/side transform.
        reverse: Select the flipped transform.

    Returns:
        Fortran-ordered array stacking the two STFT results.

    Raises:
        Exception: Whatever ``librosa.stft`` raised in the helper thread is
            re-raised in the caller.
    """
    import threading

    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # The worker reports through a dict local to this call rather than a
    # module-level global, so concurrent calls cannot overwrite each other's
    # result and a failed worker cannot leave a stale spectrogram behind.
    outcome = {}

    def run_thread(**kwargs):
        try:
            outcome["spec"] = librosa.stft(**kwargs)
        except Exception as exc:
            outcome["error"] = exc

    thread = threading.Thread(
        target=run_thread,
        kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
    )
    thread.start()
    try:
        # n_fft by keyword: recent librosa releases reject it positionally.
        spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
    finally:
        # Always wait for the worker, even if the call above raised.
        thread.join()

    if "error" in outcome:
        raise outcome["error"]
    spec_left = outcome["spec"]

    return np.asfortranarray([spec_left, spec_right])
83
+
84
+
85
def combine_spectrograms(specs, mp):
    """Stack the cropped per-band spectrograms into one array.

    Args:
        specs: Mapping from band number (1..N) to an array indexed as
            ``[channel, bin, frame]``.
        mp: Object whose ``param`` mapping provides ``"bins"``, ``"band"``
            (per-band ``"crop_start"`` / ``"crop_stop"``),
            ``"pre_filter_start"`` and ``"pre_filter_stop"``.

    Returns:
        Fortran-ordered ``complex64`` array with 2 channels,
        ``bins + 1`` rows and as many frames as the shortest band.

    Raises:
        ValueError: If the cropped bands together exceed ``bins`` rows.
    """
    params = mp.param
    bands = params["band"]

    n_frames = min([specs[key].shape[2] for key in specs])
    combined = np.zeros(shape=(2, params["bins"] + 1, n_frames), dtype=np.complex64)
    bands_n = len(bands)

    # Copy each band's cropped rows directly after the previous band's rows.
    row = 0
    for d in range(1, bands_n + 1):
        start = bands[d]["crop_start"]
        stop = bands[d]["crop_stop"]
        height = stop - start
        combined[:, row : row + height, :n_frames] = specs[d][:, start:stop, :n_frames]
        row += height

    if row > params["bins"]:
        raise ValueError("Too much bins")

    # Low-pass filter
    pre_start = params["pre_filter_start"]
    if pre_start > 0:
        if bands_n == 1:
            combined = fft_lp_filter(combined, pre_start, params["pre_filter_stop"])
        else:
            # Attenuate rows above pre_filter_start; each gain depends on the
            # previous row's gain.
            gp = 1
            for b in range(pre_start + 1, params["pre_filter_stop"]):
                g = math.pow(10, -(b - pre_start) * (3.5 - gp) / 20.0)
                gp = g
                combined[:, b, :] *= g

    return np.asfortranarray(combined)
121
+
122
+
123
def spectrogram_to_image(spec, mode="magnitude"):
    """Convert a spectrogram to an 8-bit image array.

    Args:
        spec: Real or complex array.
        mode: ``"magnitude"`` renders ``log10(|spec|**2 + 1e-8)``;
            ``"phase"`` renders ``np.angle(spec)`` for complex input and the
            values themselves for real input.

    Returns:
        ``uint8`` array scaled so the smallest value maps to 0 and the largest
        to 255 (all zeros when every value is equal). For 3-D input the first
        axis is moved last and a channel holding the per-pixel maximum is
        prepended along that last axis.

    Raises:
        ValueError: If ``mode`` is neither ``"magnitude"`` nor ``"phase"``.
    """
    if mode == "magnitude":
        y = np.abs(spec) if np.iscomplexobj(spec) else spec
        y = np.log10(y**2 + 1e-8)
    elif mode == "phase":
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
            # Work on a float copy: the in-place scaling below must not modify
            # the caller's array, and it cannot be applied to integer dtypes.
            y = np.asarray(spec)
            y = y.astype(y.dtype if np.issubdtype(y.dtype, np.floating) else np.float64)
    else:
        raise ValueError("mode must be 'magnitude' or 'phase', got {!r}".format(mode))

    y -= y.min()
    peak = y.max()
    # A constant input has peak 0; skip the scaling instead of dividing by zero.
    if peak > 0:
        y *= 255 / peak
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)

    return img
145
+
146
+
147
def reduce_vocal_aggressively(X, y, softmask):
    """Lower the magnitude of ``y`` wherever the residual ``X - y`` dominates.

    Args:
        X: Complex array.
        y: Complex array of the same shape as ``X``.
        softmask: Factor applied to the residual magnitude before it is
            subtracted.

    Returns:
        Complex array with the phase of ``y`` and a magnitude reduced by
        ``softmask * |X - y|`` (clipped at zero) in every position where
        ``|X - y| > |y|``.
    """
    y_abs = np.abs(y)
    residual_abs = np.abs(X - y)

    # Only positions where the residual is louder than y are attenuated.
    dominant = residual_abs > y_abs
    reduced = np.clip(y_abs - residual_abs * dominant * softmask, 0, np.inf)

    phase = np.exp(1.0j * np.angle(y))
    return reduced * phase
156
+
157
+
158
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
    """Add ``ref`` back into ``mag`` over long quiet stretches of ``ref``.

    Frames (indices along axis 2) whose mean of ``ref`` over axes 0 and 1 is
    below ``thres`` are grouped into runs of consecutive indices. For every
    run longer than ``min_range``, ``ref`` is added to a copy of ``mag`` with
    linear fades of ``fade_size`` frames at the run's edges.

    Args:
        mag: Array to patch; it is copied, not modified.
        ref: Reference array of the same shape.
        thres: Threshold on the per-frame mean of ``ref``.
        min_range: Minimum run length for a run to be patched.
        fade_size: Length of the fade at each edge of a run.

    Returns:
        The patched copy of ``mag``; an unmodified copy when no frame is
        below the threshold or no run is long enough.

    Raises:
        ValueError: If ``min_range`` is smaller than ``fade_size * 2``.
    """
    if min_range < fade_size * 2:
        raise ValueError("min_range must be >= fade_size * 2")

    mag = mag.copy()

    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
    if idx.size == 0:
        # Nothing is below the threshold; idx[0] / idx[-1] below would raise.
        return mag

    # Positions where consecutive quiet indices stop being adjacent.
    breaks = np.where(np.diff(idx) != 1)[0]
    starts = np.insert(idx[breaks + 1], 0, idx[0])
    ends = np.append(idx[breaks], idx[-1])
    uninformative = np.where(ends - starts > min_range)[0]
    if len(uninformative) > 0:
        starts = starts[uninformative]
        ends = ends[uninformative]
        old_e = None
        for s, e in zip(starts, ends):
            if old_e is not None and s - old_e < fade_size:
                s = old_e - fade_size * 2

            if s != 0:
                # Fade in at the start of the run.
                weight = np.linspace(0, 1, fade_size)
                mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
            else:
                s -= fade_size

            if e != mag.shape[2]:
                # Fade out at the end of the run.
                weight = np.linspace(1, 0, fade_size)
                mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
            else:
                e += fade_size

            # Full-weight region between the two fades.
            mag[:, :, s + fade_size : e - fade_size] += ref[
                :, :, s + fade_size : e - fade_size
            ]
            old_e = e

    return mag
194
+
195
+
196
def align_wave_head_and_tail(a, b):
    """Truncate two 2-D waveforms to the length of the shorter one.

    Args:
        a: 2-D array indexed as ``[channel, sample]``.
        b: 2-D array indexed as ``[channel, sample]``.

    Returns:
        Tuple ``(a, b)`` with both arrays cut to the shorter sample count.
        All channels are kept.
    """
    length = min(a[0].size, b[0].size)

    # Only the sample axis is cut. Slicing the channel axis by the sample
    # count as well would drop channels whenever the common length is smaller
    # than the channel count.
    return a[:, :length], b[:, :length]
200
+
201
+
202
def cache_or_load(mix_path, inst_path, mp):
    """Return the combined spectrograms of a mix / instrumental file pair.

    The results are cached as ``.npy`` files under
    ``cache/mph<sha1 of mp.param>/<basename>.npy``. When both cache files
    exist they are loaded; otherwise both audio files are loaded at the
    highest band's sample rate, resampled down for each lower band, converted
    with ``wave_to_spectrogram``, merged with ``combine_spectrograms`` and
    written to the cache.

    Args:
        mix_path: Path of the mixture audio file.
        inst_path: Path of the instrumental audio file.
        mp: Object whose ``param`` mapping holds the band settings
            (``"band"``, ``"mid_side"``, ``"mid_side_b2"``, ``"reverse"``).

    Returns:
        Tuple ``(X_spec_m, y_spec_m)`` of combined spectrograms.

    Raises:
        ValueError: If the two combined spectrograms differ in shape.
    """
    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

    # The cache directory is keyed by a hash of the model parameters, so
    # different parameter sets never share cached spectrograms.
    cache_dir = "mph{}".format(
        hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
    )
    # NOTE(review): both files are cached in the same directory by basename,
    # so a mix and an instrumental with the same file name would collide.
    mix_cache_dir = os.path.join("cache", cache_dir)
    inst_cache_dir = os.path.join("cache", cache_dir)

    os.makedirs(mix_cache_dir, exist_ok=True)
    os.makedirs(inst_cache_dir, exist_ok=True)

    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")

    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
        X_spec_m = np.load(mix_cache_path)
        y_spec_m = np.load(inst_cache_path)
    else:
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}

        # Walk the bands from the highest number down: the top band is read
        # from disk, every lower band is resampled from the band above it.
        for d in range(len(mp.param["band"]), 0, -1):
            bp = mp.param["band"][d]

            if d == len(mp.param["band"]):  # high-end band
                # sr / mono are passed by keyword: recent librosa releases
                # accept only the path positionally.
                X_wave[d], _ = librosa.load(
                    mix_path,
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )
                y_wave[d], _ = librosa.load(
                    inst_path,
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )
            else:  # lower bands
                upper_sr = mp.param["band"][d + 1]["sr"]
                # Same reason for the keyword form of orig_sr / target_sr.
                X_wave[d] = librosa.resample(
                    X_wave[d + 1],
                    orig_sr=upper_sr,
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )
                y_wave[d] = librosa.resample(
                    y_wave[d + 1],
                    orig_sr=upper_sr,
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )

            X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

            X_spec_s[d] = wave_to_spectrogram(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )
            y_spec_s[d] = wave_to_spectrogram(
                y_wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        # Free the waveforms before building the combined spectrograms.
        del X_wave, y_wave

        X_spec_m = combine_spectrograms(X_spec_s, mp)
        y_spec_m = combine_spectrograms(y_spec_s, mp)

        if X_spec_m.shape != y_spec_m.shape:
            raise ValueError("The combined spectrograms are different: " + mix_path)

        np.save(mix_cache_path, X_spec_m)
        np.save(inst_cache_path, y_spec_m)

    return X_spec_m, y_spec_m
285
+
286
+
287
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    """Invert a two-channel spectrogram into a two-channel wave.

    ``spec[0]`` and ``spec[1]`` are each passed through ``librosa.istft``.
    The first truthy flag among ``reverse``, ``mid_side`` and ``mid_side_b2``
    (checked in that order) selects how the two results are recombined; with
    no flag set they are returned as-is.

    Args:
        spec: Indexable holding the two channel spectrograms.
        hop_length: Hop length forwarded to ``librosa.istft``.
        mid_side: Recombine as ``left + right / 2`` and ``left - right / 2``.
        mid_side_b2: Recombine with the 1.25 / 0.4 weighting.
        reverse: Flip both channels.

    Returns:
        Fortran-ordered array stacking the two output channels.
    """
    left, right = (
        librosa.istft(np.asfortranarray(channel), hop_length=hop_length)
        for channel in (spec[0], spec[1])
    )

    if reverse:
        channels = [np.flip(left), np.flip(right)]
    elif mid_side:
        channels = [left + right / 2, left - right / 2]
    elif mid_side_b2:
        channels = [right / 1.25 + 0.4 * left, left / 1.25 - 0.4 * right]
    else:
        channels = [left, right]

    return np.asfortranarray(channels)
309
+
310
+
311
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
    """Invert a two-channel spectrogram, running one inverse STFT in a thread.

    ``spec[0]`` is inverted in a helper thread while ``spec[1]`` is inverted
    in the calling thread. Note that the parameter order here is
    ``mid_side, reverse, mid_side_b2``.

    Args:
        spec: Indexable holding the two channel spectrograms.
        hop_length: Hop length forwarded to ``librosa.istft``.
        mid_side: Recombine as ``left + right / 2`` and ``left - right / 2``.
        reverse: Flip both channels.
        mid_side_b2: Recombine with the 1.25 / 0.4 weighting.

    Returns:
        Fortran-ordered array stacking the two output channels.

    Raises:
        RuntimeError: If the helper thread finished without a result.
    """
    import threading

    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    # The worker result is kept in a local container. The previous version
    # stored it in a module-level global, which leaked state between calls
    # and could silently reuse a stale wave when the worker failed.
    result = {}

    def run_thread(**kwargs):
        result["wave_left"] = librosa.istft(**kwargs)

    thread = threading.Thread(
        target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
    )
    thread.start()
    wave_right = librosa.istft(spec_right, hop_length=hop_length)
    thread.join()

    if "wave_left" not in result:
        raise RuntimeError("Inverse STFT of the first channel did not complete.")
    wave_left = result["wave_left"]

    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
        return np.asfortranarray(
            [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
        )
    elif mid_side_b2:
        return np.asfortranarray(
            [
                np.add(wave_right / 1.25, 0.4 * wave_left),
                np.subtract(wave_left / 1.25, 0.4 * wave_right),
            ]
        )
    else:
        return np.asfortranarray([wave_left, wave_right])
343
+
344
+
345
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
    """Convert a combined multi-band spectrogram back into a wave.

    Bands are processed from 1 up to ``len(mp.param["band"])``. Each band's
    slice of ``spec_m`` is copied into a full-size spectrogram, filtered,
    inverted, added to the running wave and resampled to the next band's
    sample rate. The top band is added without a final resample.

    Args:
        spec_m: Combined spectrogram, indexed as ``[channel, bin, frame]``.
        mp: Object exposing the model parameters as ``mp.param``.
        extra_bins_h: Number of extra high-end bins to paste into the top
            band; falsy disables the paste.
        extra_bins: Source of the extra bins; only read when
            ``extra_bins_h`` is truthy.

    Returns:
        The reconstructed wave, transposed (``wave.T``).
    """
    bands = mp.param["band"]
    bands_n = len(bands)
    offset = 0
    wave = None

    for d in range(1, bands_n + 1):
        bp = bands[d]
        # Zero-initialised on purpose: bins outside crop_start:crop_stop are
        # never written, and np.ndarray() would leave them as arbitrary memory.
        spec_s = np.zeros(
            shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
        )
        h = bp["crop_stop"] - bp["crop_start"]
        spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
            :, offset : offset + h, :
        ]
        offset += h

        is_highest = d == bands_n

        # Band-specific filtering, applied before the inverse transform.
        if is_highest:
            if extra_bins_h:  # if --high_end_process bypass
                max_bin = bp["n_fft"] // 2
                spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
                    :, :extra_bins_h, :
                ]
            if bp["hpf_start"] > 0:
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
        elif d == 1:  # lowest band
            spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
        else:  # mid bands
            spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
            spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])

        band_wave = spectrogram_to_wave(
            spec_s,
            bp["hl"],
            mp.param["mid_side"],
            mp.param["mid_side_b2"],
            mp.param["reverse"],
        )

        # Accumulate, then resample to the next band's rate where applicable.
        if is_highest:
            wave = band_wave if bands_n == 1 else np.add(wave, band_wave)
        elif d == 1:
            wave = librosa.resample(
                band_wave,
                orig_sr=bp["sr"],
                target_sr=bands[d + 1]["sr"],
                res_type="sinc_fastest",
            )
        else:
            wave = librosa.resample(
                np.add(wave, band_wave),
                orig_sr=bp["sr"],
                target_sr=bands[d + 1]["sr"],
                res_type="scipy",
            )

    return wave.T
421
+
422
+
423
def fft_lp_filter(spec, bin_start, bin_stop):
    """Apply a linear low-pass ramp along axis 1 of ``spec``, in place.

    Bins ``bin_start`` .. ``bin_stop - 1`` are scaled by a gain that drops by
    ``1 / (bin_stop - bin_start)`` per bin, starting from 1.0. Every bin from
    ``bin_stop`` upward is multiplied by zero.

    Args:
        spec: 3-D array; it is modified in place.
        bin_start: First bin of the ramp.
        bin_stop: First bin of the muted region.

    Returns:
        The same ``spec`` object.
    """
    ramp_width = bin_stop - bin_start
    gain = 1.0

    bin_idx = bin_start
    while bin_idx < bin_stop:
        gain -= 1 / ramp_width
        spec[:, bin_idx, :] = spec[:, bin_idx, :] * gain
        bin_idx += 1

    spec[:, bin_stop:, :] *= 0

    return spec
432
+
433
+
434
def fft_hp_filter(spec, bin_start, bin_stop):
    """Apply a linear high-pass ramp along axis 1 of ``spec``, in place.

    Walking downward from ``bin_start`` to ``bin_stop + 1``, each bin is
    scaled by a gain that drops by ``1 / (bin_start - bin_stop)`` per bin,
    starting from 1.0. Bins ``0`` .. ``bin_stop`` are multiplied by zero.

    Args:
        spec: 3-D array; it is modified in place.
        bin_start: Highest bin of the ramp.
        bin_stop: Highest bin of the muted region.

    Returns:
        The same ``spec`` object.
    """
    ramp_width = bin_start - bin_stop
    gain = 1.0

    for bin_idx in reversed(range(bin_stop + 1, bin_start + 1)):
        gain -= 1 / ramp_width
        spec[:, bin_idx, :] = spec[:, bin_idx, :] * gain

    spec[:, : bin_stop + 1, :] *= 0

    return spec
443
+
444
+
445
def mirroring(a, spec_m, input_high_end, mp):
    """Build a high-end replacement by mirroring bins below the pre-filter.

    A window of ``spec_m`` as tall as ``input_high_end`` and ending 10 bins
    below ``mp.param["pre_filter_start"]`` is taken, its magnitude is flipped
    along axis 1, and the result is combined with ``input_high_end``:

    * ``"mirroring"``: the mirrored magnitude takes the phase of
      ``input_high_end``.
    * ``"mirroring2"``: the mirrored magnitude is multiplied by
      ``input_high_end * 1.7``.

    In both modes the element with the smaller magnitude is kept, preferring
    ``input_high_end`` on ties.

    Args:
        a: Mode name.
        spec_m: Combined spectrogram, indexed as ``[channel, bin, frame]``.
        input_high_end: High-end bins of the input, same layout.
        mp: Object exposing the model parameters as ``mp.param``.

    Returns:
        The combined high-end array, or ``None`` for any other mode name.
    """
    if a not in ("mirroring", "mirroring2"):
        return None

    window_stop = mp.param["pre_filter_start"] - 10
    window_start = window_stop - input_high_end.shape[1]
    mirror = np.flip(np.abs(spec_m[:, window_start:window_stop, :]), 1)

    if a == "mirroring":
        candidate = mirror * np.exp(1.0j * np.angle(input_high_end))
    else:
        candidate = np.multiply(mirror, input_high_end * 1.7)

    keep_input = np.abs(input_high_end) <= np.abs(candidate)
    return np.where(keep_input, input_high_end, candidate)
483
+
484
+
485
def ensembling(a, specs):
    """Fold several spectrograms into one by per-element magnitude selection.

    Starting from ``specs[0]``, each following spectrogram is compared with
    the running result after both are truncated to their common length along
    axis 2.

    Args:
        a: ``"min_mag"`` keeps the element with the smaller magnitude,
            ``"max_mag"`` the larger one. Any other value only applies the
            truncation.
        specs: Spectrograms indexable by ``0 .. len(specs) - 1`` (a list, or
            a dict keyed by consecutive integers). It is not modified.

    Returns:
        The combined spectrogram. With a single entry, that entry is returned
        unchanged (previously this raised ``UnboundLocalError``).
    """
    spec = specs[0]

    for i in range(1, len(specs)):
        ln = min(spec.shape[2], specs[i].shape[2])
        spec = spec[:, :, :ln]
        # Work on a local slice so the caller's container is left untouched.
        other = specs[i][:, :, :ln]

        if a == "min_mag":
            spec = np.where(np.abs(other) <= np.abs(spec), other, spec)
        elif a == "max_mag":
            spec = np.where(np.abs(other) >= np.abs(spec), other, spec)

    return spec
500
+
501
+
502
def stft(wave, nfft, hl):
    """Compute the STFT of both channels of ``wave``.

    Args:
        wave: Indexable holding the two channel signals.
        nfft: FFT size, forwarded as ``n_fft``.
        hl: Hop length.

    Returns:
        Fortran-ordered array stacking the two channel spectrograms.
    """
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])

    # n_fft is passed by keyword: recent librosa releases reject it as a
    # positional argument, and the keyword form also works on older ones.
    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)

    return np.asfortranarray([spec_left, spec_right])
510
+
511
+
512
def istft(spec, hl):
    """Compute the inverse STFT of both channels of ``spec``.

    Args:
        spec: Indexable holding the two channel spectrograms.
        hl: Hop length.

    Returns:
        Fortran-ordered array stacking the two channel waves. The previous
        version built this array but fell off the end and returned ``None``.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)

    return np.asfortranarray([wave_left, wave_right])
519
+
520
+
521
if __name__ == "__main__":
    # Command-line entry point: load two or more audio files, convert them to
    # combined spectrograms and write the result of the chosen algorithm.
    import argparse
    import subprocess
    import sys
    import time

    import cv2
    from model_param_init import ModelParameters

    p = argparse.ArgumentParser()
    p.add_argument(
        "--algorithm",
        "-a",
        type=str,
        choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
        default="min_mag",
    )
    p.add_argument(
        "--model_params",
        "-m",
        type=str,
        default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
    )
    p.add_argument("--output_name", "-o", type=str, default="output")
    p.add_argument("--vocals_only", "-v", action="store_true")
    p.add_argument("input", nargs="+")
    args = p.parse_args()

    start_time = time.time()

    if args.algorithm.startswith("invert") and len(args.input) != 2:
        raise ValueError("There should be two input files.")

    if not args.algorithm.startswith("invert") and len(args.input) < 2:
        raise ValueError("There must be at least two input files.")

    wave, specs = {}, {}
    mp = ModelParameters(args.model_params)
    bands_n = len(mp.param["band"])

    for i, input_path in enumerate(args.input):
        spec = {}

        for d in range(bands_n, 0, -1):
            bp = mp.param["band"][d]

            if d == bands_n:  # high-end band
                # Keyword arguments keep this call valid on librosa releases
                # where sr/mono are keyword-only.
                wave[d], _ = librosa.load(
                    input_path,
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )

                if len(wave[d].shape) == 1:  # mono to stereo
                    wave[d] = np.array([wave[d], wave[d]])
            else:  # lower bands
                wave[d] = librosa.resample(
                    wave[d + 1],
                    orig_sr=mp.param["band"][d + 1]["sr"],
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )

            spec[d] = wave_to_spectrogram(
                wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        specs[i] = combine_spectrograms(spec, mp)

    del wave

    if args.algorithm == "deep":
        # Compare the two *combined* spectrograms. The previous code read
        # ``spec[1]``, i.e. band 1 of the last input, instead of ``specs[1]``.
        ln = min(specs[0].shape[2], specs[1].shape[2])
        first = specs[0][:, :, :ln]
        second = specs[1][:, :, :ln]
        d_spec = np.where(np.abs(first) <= np.abs(second), first, second)
        v_spec = d_spec - second
        sf.write(
            "{}.wav".format(args.output_name),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )

    if args.algorithm.startswith("invert"):
        ln = min([specs[0].shape[2], specs[1].shape[2]])
        specs[0] = specs[0][:, :, :ln]
        specs[1] = specs[1][:, :, :ln]

        if "invert_p" == args.algorithm:
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
            v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
        else:
            specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
            v_spec = specs[0] - specs[1]

            if not args.vocals_only:
                X_mag = np.abs(specs[0])
                y_mag = np.abs(specs[1])
                v_mag = np.abs(v_spec)

                X_image = spectrogram_to_image(X_mag)
                y_image = spectrogram_to_image(y_mag)
                v_image = spectrogram_to_image(v_mag)

                cv2.imwrite("{}_X.png".format(args.output_name), X_image)
                cv2.imwrite("{}_y.png".format(args.output_name), y_image)
                cv2.imwrite("{}_v.png".format(args.output_name), v_image)

                sf.write(
                    "{}_X.wav".format(args.output_name),
                    cmb_spectrogram_to_wave(specs[0], mp),
                    mp.param["sr"],
                )
                sf.write(
                    "{}_y.wav".format(args.output_name),
                    cmb_spectrogram_to_wave(specs[1], mp),
                    mp.param["sr"],
                )

        sf.write(
            "{}_v.wav".format(args.output_name),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )
    else:
        if not args.algorithm == "deep":
            # The output directory is not created anywhere else.
            os.makedirs("ensembled", exist_ok=True)
            sf.write(
                os.path.join("ensembled", "{}.wav".format(args.output_name)),
                cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
                mp.param["sr"],
            )

    if args.algorithm == "align":
        trackalignment = [
            {
                "file1": args.input[0],
                "file2": args.input[1],
            }
        ]

        for e in tqdm(trackalignment, desc="Performing Alignment..."):
            # An argument list avoids shell interpretation of the file names.
            # As with the former os.system call, the exit status is ignored.
            subprocess.run(
                [sys.executable, "lib/align_tracks.py", e["file1"], e["file2"]],
                check=False,
            )

    # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
uvr5_pack/name_params.json ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "equivalent" : [
3
+ {
4
+ "model_hash_name" : [
5
+ {
6
+ "hash_name": "47939caf0cfe52a0e81442b85b971dfd",
7
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
8
+ "param_name": "4band_44100"
9
+ },
10
+ {
11
+ "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
12
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
13
+ "param_name": "4band_v2"
14
+ },
15
+ {
16
+ "hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
17
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
18
+ "param_name": "4band_v2"
19
+ },
20
+ {
21
+ "hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
22
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
23
+ "param_name": "4band_44100"
24
+ },
25
+ {
26
+ "hash_name": "a82f14e75892e55e994376edbf0c8435",
27
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
28
+ "param_name": "4band_44100"
29
+ },
30
+ {
31
+ "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
32
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
33
+ "param_name": "4band_v2_sn"
34
+ },
35
+ {
36
+ "hash_name": "08611fb99bd59eaa79ad27c58d137727",
37
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
38
+ "param_name": "4band_v2_sn"
39
+ },
40
+ {
41
+ "hash_name": "5c7bbca45a187e81abbbd351606164e5",
42
+ "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
43
+ "param_name": "3band_44100_msb2"
44
+ },
45
+ {
46
+ "hash_name": "d6b2cb685a058a091e5e7098192d3233",
47
+ "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
48
+ "param_name": "3band_44100_msb2"
49
+ },
50
+ {
51
+ "hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
52
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
53
+ "param_name": "4band_44100"
54
+ },
55
+ {
56
+ "hash_name": "c3448ec923fa0edf3d03a19e633faa53",
57
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
58
+ "param_name": "4band_44100"
59
+ },
60
+ {
61
+ "hash_name": "68aa2c8093d0080704b200d140f59e54",
62
+ "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
63
+ "param_name": "3band_44100"
64
+ },
65
+ {
66
+ "hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
67
+ "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
68
+ "param_name": "3band_44100_mid.json"
69
+ },
70
+ {
71
+ "hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
72
+ "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
73
+ "param_name": "3band_44100_mid.json"
74
+ },
75
+ {
76
+ "hash_name": "52fdca89576f06cf4340b74a4730ee5f",
77
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
78
+ "param_name": "4band_44100.json"
79
+ },
80
+ {
81
+ "hash_name": "41191165b05d38fc77f072fa9e8e8a30",
82
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
83
+ "param_name": "4band_44100.json"
84
+ },
85
+ {
86
+ "hash_name": "89e83b511ad474592689e562d5b1f80e",
87
+ "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
88
+ "param_name": "2band_32000.json"
89
+ },
90
+ {
91
+ "hash_name": "0b954da81d453b716b114d6d7c95177f",
92
+ "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
93
+ "param_name": "2band_32000.json"
94
+ }
95
+
96
+ ],
97
+ "v4 Models": [
98
+ {
99
+ "hash_name": "6a00461c51c2920fd68937d4609ed6c8",
100
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
101
+ "param_name": "1band_sr16000_hl512"
102
+ },
103
+ {
104
+ "hash_name": "0ab504864d20f1bd378fe9c81ef37140",
105
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
106
+ "param_name": "1band_sr32000_hl512"
107
+ },
108
+ {
109
+ "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
110
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
111
+ "param_name": "1band_sr32000_hl512"
112
+ },
113
+ {
114
+ "hash_name": "80ab74d65e515caa3622728d2de07d23",
115
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
116
+ "param_name": "1band_sr32000_hl512"
117
+ },
118
+ {
119
+ "hash_name": "edc115e7fc523245062200c00caa847f",
120
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
121
+ "param_name": "1band_sr33075_hl384"
122
+ },
123
+ {
124
+ "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
125
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
126
+ "param_name": "1band_sr33075_hl384"
127
+ },
128
+ {
129
+ "hash_name": "b58090534c52cbc3e9b5104bad666ef2",
130
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
131
+ "param_name": "1band_sr44100_hl512"
132
+ },
133
+ {
134
+ "hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
135
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
136
+ "param_name": "1band_sr44100_hl512"
137
+ },
138
+ {
139
+ "hash_name": "ae702fed0238afb5346db8356fe25f13",
140
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
141
+ "param_name": "1band_sr44100_hl1024"
142
+ }
143
+ ]
144
+ }
145
+ ],
146
+ "User Models" : [
147
+ {
148
+ "1 Band": [
149
+ {
150
+ "hash_name": "1band_sr16000_hl512",
151
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
152
+ "param_name": "1band_sr16000_hl512"
153
+ },
154
+ {
155
+ "hash_name": "1band_sr32000_hl512",
156
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
157
+ "param_name": "1band_sr32000_hl512"
158
+ },
159
+ {
160
+ "hash_name": "1band_sr33075_hl384",
161
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
162
+ "param_name": "1band_sr33075_hl384"
163
+ },
164
+ {
165
+ "hash_name": "1band_sr44100_hl256",
166
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json",
167
+ "param_name": "1band_sr44100_hl256"
168
+ },
169
+ {
170
+ "hash_name": "1band_sr44100_hl512",
171
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
172
+ "param_name": "1band_sr44100_hl512"
173
+ },
174
+ {
175
+ "hash_name": "1band_sr44100_hl1024",
176
+ "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
177
+ "param_name": "1band_sr44100_hl1024"
178
+ }
179
+ ],
180
+ "2 Band": [
181
+ {
182
+ "hash_name": "2band_44100_lofi",
183
+ "model_params": "uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json",
184
+ "param_name": "2band_44100_lofi"
185
+ },
186
+ {
187
+ "hash_name": "2band_32000",
188
+ "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
189
+ "param_name": "2band_32000"
190
+ },
191
+ {
192
+ "hash_name": "2band_48000",
193
+ "model_params": "uvr5_pack/lib_v5/modelparams/2band_48000.json",
194
+ "param_name": "2band_48000"
195
+ }
196
+ ],
197
+ "3 Band": [
198
+ {
199
+ "hash_name": "3band_44100",
200
+ "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
201
+ "param_name": "3band_44100"
202
+ },
203
+ {
204
+ "hash_name": "3band_44100_mid",
205
+ "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
206
+ "param_name": "3band_44100_mid"
207
+ },
208
+ {
209
+ "hash_name": "3band_44100_msb2",
210
+ "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
211
+ "param_name": "3band_44100_msb2"
212
+ }
213
+ ],
214
+ "4 Band": [
215
+ {
216
+ "hash_name": "4band_44100",
217
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
218
+ "param_name": "4band_44100"
219
+ },
220
+ {
221
+ "hash_name": "4band_44100_mid",
222
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_mid.json",
223
+ "param_name": "4band_44100_mid"
224
+ },
225
+ {
226
+ "hash_name": "4band_44100_msb",
227
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb.json",
228
+ "param_name": "4band_44100_msb"
229
+ },
230
+ {
231
+ "hash_name": "4band_44100_msb2",
232
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json",
233
+ "param_name": "4band_44100_msb2"
234
+ },
235
+ {
236
+ "hash_name": "4band_44100_reverse",
237
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json",
238
+ "param_name": "4band_44100_reverse"
239
+ },
240
+ {
241
+ "hash_name": "4band_44100_sw",
242
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_sw.json",
243
+ "param_name": "4band_44100_sw"
244
+ },
245
+ {
246
+ "hash_name": "4band_v2",
247
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
248
+ "param_name": "4band_v2"
249
+ },
250
+ {
251
+ "hash_name": "4band_v2_sn",
252
+ "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
253
+ "param_name": "4band_v2_sn"
254
+ },
255
+ {
256
+ "hash_name": "tmodelparam",
257
+ "model_params": "uvr5_pack/lib_v5/modelparams/tmodelparam.json",
258
+ "param_name": "User Model Param Set"
259
+ }
260
+ ]
261
+ }
262
+ ]
263
+ }
uvr5_pack/utils.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+ import json
5
+
6
+
7
def load_data(file_name: str = "./uvr5_pack/name_params.json") -> dict:
    """Load and return the JSON document stored at ``file_name``.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The decoded JSON content.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
12
+
13
+
14
def make_padding(width, cropsize, offset):
    """Compute left/right padding and the region-of-interest size.

    The region of interest is ``cropsize - 2 * offset``, falling back to
    ``cropsize`` when that difference is zero. The right padding is
    ``roi_size - (width % roi_size) + offset``.

    Args:
        width: Length of the axis to be padded.
        cropsize: Size of one processing window.
        offset: Margin on each side of a window; also the left padding.

    Returns:
        Tuple ``(left, right, roi_size)``.
    """
    inner = cropsize - offset * 2
    roi_size = cropsize if inner == 0 else inner

    remainder = width % roi_size
    right = roi_size - remainder + offset

    return offset, right, roi_size
22
+
23
+
24
def inference(X_spec, device, model, aggressiveness, data):
    """Run ``model.predict`` over a spectrogram in sliding windows.

    The magnitude of ``X_spec`` is divided by its maximum, padded along
    axis 2 and cut into windows of ``data["window_size"]`` frames that are
    fed to ``model.predict``. When ``data["tta"]`` is truthy a second pass,
    shifted by half a region of interest, is averaged with the first.

    Args:
        X_spec: Complex spectrogram; axis 2 is the windowed axis.
        device: Target passed to ``Tensor.to`` for each window.
        model: Object providing ``eval()``, ``predict(window,
            aggressiveness)``, ``state_dict()`` and an ``offset`` attribute.
        aggressiveness: Forwarded unchanged to ``model.predict``.
        data: Config mapping; the keys ``"window_size"`` and ``"tta"`` are
            read.

    Returns:
        Tuple ``(pred, X_mag, X_phase_exp)``: the prediction rescaled by the
        original maximum, the input magnitude, and ``exp(1j * phase)`` of
        the input.
    """

    def _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
    ):
        # Predict every window and concatenate the results along axis 2.
        model.eval()
        with torch.no_grad():
            preds = []

            for i in tqdm(range(n_window)):
                start = i * roi_size
                X_mag_window = X_mag_pad[
                    None, :, :, start : start + data["window_size"]
                ]
                X_mag_window = torch.from_numpy(X_mag_window)
                if is_half:
                    X_mag_window = X_mag_window.half()
                X_mag_window = X_mag_window.to(device)

                pred = model.predict(X_mag_window, aggressiveness)

                pred = pred.detach().cpu().numpy()
                preds.append(pred[0])

            pred = np.concatenate(preds, axis=2)
        return pred

    X_mag = np.abs(X_spec)
    X_phase = np.angle(X_spec)

    coef = X_mag.max()
    # An all-zero input would otherwise divide by zero and feed NaNs to the
    # model. Normalising by 1 keeps the input at zero, and the final
    # multiplication by ``coef`` (0) still yields a silent prediction.
    X_mag_pre = X_mag / (coef if coef > 0 else 1.0)

    n_frame = X_mag_pre.shape[2]
    pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
    n_window = int(np.ceil(n_frame / roi_size))

    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

    # Feed half-precision windows when the first state-dict tensor is float16.
    is_half = list(model.state_dict().values())[0].dtype == torch.float16

    pred = _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
    )
    pred = pred[:, :, :n_frame]

    if data["tta"]:
        # Second pass with the windows shifted by half a region of interest.
        pad_l += roi_size // 2
        pad_r += roi_size // 2
        n_window += 1

        X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

        pred_tta = _execute(
            X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
        )
        pred_tta = pred_tta[:, :, roi_size // 2 :]
        pred_tta = pred_tta[:, :, :n_frame]

        return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
    else:
        return pred * coef, X_mag, np.exp(1.0j * X_phase)
99
+
100
+
101
def _get_name_params(model_path, model_hash):
    """Look up the parameter set registered for a model.

    Every entry returned by ``load_data()`` is scanned. An entry matches when
    its ``hash_name`` equals ``model_hash`` or occurs as a substring of
    ``model_path``. A match in the ``"equivalent"`` group is returned
    immediately; in any other group the scan continues and the last match
    wins.

    Args:
        model_path: Path (or name) of the model file.
        model_hash: Hash string identifying the model.

    Returns:
        Tuple ``(param_name, model_params)`` of the matching entry.

    Raises:
        ValueError: If no entry matches. (The previous version failed here
            with an ``UnboundLocalError`` on its return statement.)
    """
    data = load_data()
    match = None

    for group_name, group in data.items():
        # Each top-level value is a one-element list wrapping a dict of
        # category name -> list of entries.
        for entries in group[0].values():
            for entry in entries:
                hash_name = str(entry["hash_name"])
                if hash_name == model_hash or hash_name in model_path:
                    match = (entry["param_name"], entry["model_params"])
                    if group_name == "equivalent":
                        return match

    if match is None:
        raise ValueError(
            "No model parameters found for model {!r} (hash {!r}).".format(
                model_path, model_hash
            )
        )
    return match