diff --git a/AIMeiSheng/diffuse_fang/diffUse_wraper_double.py b/AIMeiSheng/diffuse_fang/diffUse_wraper_double.py
new file mode 100644
index 0000000..a958906
--- /dev/null
+++ b/AIMeiSheng/diffuse_fang/diffUse_wraper_double.py
@@ -0,0 +1,59 @@
+from diffuse_fang.diffusion.wavenet import WaveNet
+from diffuse_fang.diffusion.diffusion import GaussianDiffusion
+
+import torch
+
+out_dims = 256  # 192  ## determines the output feature dimension
+n_layers = 20
+n_chans = 384
+n_hidden = 256  # 192  # 256  ## determines the input feature dimension
+timesteps=1000
+k_step_max=1000
+
+
+#class WaveNet(nn.Module):
+# def __init__(self, in_dims=128, n_layers=20, n_chans=384, n_hidden=256):
+
+### out: B x n_frames x feat; at inference this returns the target data, during training it returns the MSE loss
+# input size:  B x n_frames x n_hidden
+# output size: B x n_frames x out_dims
+diff_decoder = GaussianDiffusion(WaveNet(out_dims, n_layers, n_chans, n_hidden),timesteps=timesteps,k_step=k_step_max, out_dims=out_dims)
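+# this module-level instance is imported and reused as-is by the model files below (see models_embed_in_dec_..._double.py)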
+
+'''
+gt_spec=None  # this is the x0 data; not needed for inference, needed for testing
+infer=True  # set to False when training
+infer_speedup=10
+method='dpm-solver'
+k_step=100
+use_tqdm=True
+#'''
+
+class ddpm_para():
+ def __init__(self, gt_spec=None,infer=True,infer_speedup=10,method='dpm-solver',k_step=100,use_tqdm = True):
+ #self.use_tqdm = use_tqdm #True
+        self.gt_spec = gt_spec  # None; this is the x0 data: not needed for inference, needed for testing
+        self.infer = infer  # True; set to False when training
+ self.infer_speedup = infer_speedup#10
+ self.method = method #'dpm-solver'
+ self.k_step = k_step
+ self.use_tqdm = use_tqdm
+
+
+if __name__ == "__main__":
+ ddpm_dp = ddpm_para()
+
+ B = 32
+ n_frames = 120
+ n_unit = 192
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ diff_decoder = diff_decoder.to(device)
+ x = torch.randn(B, n_frames,n_unit).to(device) ##input: B x n_frames x n_unit
+ print("@@@ input x shape:", x.shape)
+    # generate label data (assuming a simple linear classification)
+ # Y = torch.randint(0, 2, (num_samples, output_dim)).float()
+
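+    # training-mode sketch: with infer=False and gt_spec set to the target features,
+    # the wrapper returns the MSE diffusion loss instead of generated features (see the comment at the top)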
+ out = diff_decoder(x, gt_spec=ddpm_dp.gt_spec, infer=ddpm_dp.infer, infer_speedup=ddpm_dp.infer_speedup, method=ddpm_dp.method, k_step=ddpm_dp.k_step, use_tqdm=ddpm_dp.use_tqdm)
+ print("@@@ out shape:",out.shape) #torch.Size([32, 120, 128]) ###out: B x n_frames x feat
+
+
diff --git a/AIMeiSheng/docker_demo/common.py b/AIMeiSheng/docker_demo/common.py
index 9d0ebff..6d28a72 100644
--- a/AIMeiSheng/docker_demo/common.py
+++ b/AIMeiSheng/docker_demo/common.py
@@ -1,112 +1,112 @@
import os
import sys
import time
# import logging
import urllib, urllib.request
# test / production environment
gs_prod = False
if len(sys.argv) > 1 and sys.argv[1] == "prod":
gs_prod = True
print(gs_prod)
gs_tmp_dir = "/tmp/ai_meisheng_tmp"
gs_model_dir = "/tmp/ai_meisheng_models"
gs_resource_cache_dir = "/tmp/ai_meisheng_resource_cache"
gs_embed_model_path = os.path.join(gs_model_dir, "RawNet3/models/weights/model.pt")
gs_svc_model_path = os.path.join(gs_model_dir,
- "weights/xusong_v2_org_version_alldata_embed_spkenx200x_vocal_e22_s95040.pth")
+ "weights/xusong_v2_org_version_alldata_embed_spkenx200x_double_e14_s90706.pth")
gs_hubert_model_path = os.path.join(gs_model_dir, "hubert.pt")
gs_rmvpe_model_path = os.path.join(gs_model_dir, "rmvpe.pt")
gs_embed_model_spk_path = os.path.join(gs_model_dir, "SpeakerEncoder/pretrained_model/best_model.pth.tar")
gs_embed_config_spk_path = os.path.join(gs_model_dir, "SpeakerEncoder/pretrained_model/config.json")
# errcode
gs_err_code_success = 0
gs_err_code_download_vocal = 100
gs_err_code_download_svc_url = 101
gs_err_code_svc_process = 102
gs_err_code_transcode = 103
gs_err_code_volume_adjust = 104
gs_err_code_upload = 105
gs_err_code_params = 106
gs_err_code_pending = 107
gs_err_code_target_silence = 108
gs_err_code_too_many_connections = 429
gs_redis_conf = {
"host": "av-credis.starmaker.co",
"port": 6379,
"pwd": "lKoWEhz%jxTO",
}
gs_server_redis_conf = {
"producer": "test_ai_meisheng_producer", # 输入的队列
"ai_meisheng_key_prefix": "test_ai_meisheng_key_", # 存储结果情况
}
if gs_prod:
gs_server_redis_conf = {
"producer": "ai_meisheng_producer", # 输入的队列
"ai_meisheng_key_prefix": "ai_meisheng_key_", # 存储结果情况
}
def download2disk(url, dst_path):
try:
urllib.request.urlretrieve(url, dst_path)
return os.path.exists(dst_path)
except Exception as ex:
print(f"download url={url} error", ex)
return False
def exec_cmd(cmd):
# gs_logger.info(cmd)
print(cmd)
ret = os.system(cmd)
if ret != 0:
return False
return True
def exec_cmd_and_result(cmd):
r = os.popen(cmd)
text = r.read()
r.close()
return text
def upload_file2cos(key, file_path, region='ap-singapore', bucket_name='av-audit-sync-sg-1256122840'):
"""
将文件上传到cos
:param key: 桶上的具体地址
:param file_path: 本地文件地址
:param region: 区域
:param bucket_name: 桶地址
:return:
"""
gs_coscmd = "coscmd"
gs_coscmd_conf = "~/.cos.conf"
cmd = "{} -c {} -r {} -b {} upload {} {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, file_path, key)
if exec_cmd(cmd):
cmd = "{} -c {} -r {} -b {} info {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, key) \
+ "| grep Content-Length |awk \'{print $2}\'"
res_str = exec_cmd_and_result(cmd)
# logging.info("{},res={}".format(key, res_str))
size = float(res_str)
if size > 0:
return True
return False
return False
def check_input(input_data):
key_list = ["record_song_url", "target_url", "start", "end", "vocal_loudness", "female_recording_url",
"male_recording_url"]
for key in key_list:
if key not in input_data.keys():
return False
return True
diff --git a/AIMeiSheng/lib/infer_pack/attentions_in_dec_double.py b/AIMeiSheng/lib/infer_pack/attentions_in_dec_double.py
new file mode 100644
index 0000000..6f5b23c
--- /dev/null
+++ b/AIMeiSheng/lib/infer_pack/attentions_in_dec_double.py
@@ -0,0 +1,424 @@
+import copy
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from lib.infer_pack import commons
+from lib.infer_pack import modules
+from lib.infer_pack.modules import LayerNorm,AdaIN1d,AdaIN2d
+
+g2_dim = 256
+class Encoder(nn.Module):
+ def __init__(
+ self,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size=1,
+ p_dropout=0.0,
+ window_size=10,
+ **kwargs
+ ):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+
+ self.drop = nn.Dropout(p_dropout)
+ self.attn_layers = nn.ModuleList()
+ self.norm_layers_1 = nn.ModuleList()
+ self.ffn_layers = nn.ModuleList()
+ self.norm_layers_2 = nn.ModuleList()
+ for i in range(self.n_layers):
+ self.attn_layers.append(
+ MultiHeadAttention(
+ hidden_channels,
+ hidden_channels,
+ n_heads,
+ p_dropout=p_dropout,
+ window_size=window_size,
+ )
+ )
+ #self.norm_layers_1.append(LayerNorm(hidden_channels))
+ #self.norm_layers_1.append(AdaIN1d(hidden_channels,256)) #fang add
+ self.norm_layers_1.append(AdaIN1d(256,g2_dim))#fang add
+ #print("xxxhidden_channels:",hidden_channels)
+ #print("xxxfilter_channels:",filter_channels)
+ self.ffn_layers.append(
+ FFN(
+ hidden_channels,
+ hidden_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=p_dropout,
+ )
+ )
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+ def forward(self, x, x_mask,g):#fang add
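+        # g: global (speaker) conditioning, [b, 256, 1]; squeezed to [b, 256] and passed to AdaIN1d below (fang's change)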
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ x = x * x_mask
+ for i in range(self.n_layers):
+ y = self.attn_layers[i](x, x, attn_mask)
+ y = self.drop(y)
+ #print("@@@ x:",x.shape) #fang add
+ #x = self.norm_layers_1[i](x + y)
+ #print("@@g:",g.shape)
+ x = self.norm_layers_1[i](x + y,torch.squeeze(g,dim=-1))#fang add
+ #print("@@@norm x:",x.shape)#fang add
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size=1,
+ p_dropout=0.0,
+ proximal_bias=False,
+ proximal_init=True,
+ **kwargs
+ ):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.proximal_bias = proximal_bias
+ self.proximal_init = proximal_init
+
+ self.drop = nn.Dropout(p_dropout)
+ self.self_attn_layers = nn.ModuleList()
+ self.norm_layers_0 = nn.ModuleList()
+ self.encdec_attn_layers = nn.ModuleList()
+ self.norm_layers_1 = nn.ModuleList()
+ self.ffn_layers = nn.ModuleList()
+ self.norm_layers_2 = nn.ModuleList()
+ for i in range(self.n_layers):
+ self.self_attn_layers.append(
+ MultiHeadAttention(
+ hidden_channels,
+ hidden_channels,
+ n_heads,
+ p_dropout=p_dropout,
+ proximal_bias=proximal_bias,
+ proximal_init=proximal_init,
+ )
+ )
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
+ self.encdec_attn_layers.append(
+ MultiHeadAttention(
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
+ )
+ )
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
+ self.ffn_layers.append(
+ FFN(
+ hidden_channels,
+ hidden_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=p_dropout,
+ causal=True,
+ )
+ )
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+ def forward(self, x, x_mask, h, h_mask):
+ """
+ x: decoder input
+ h: encoder output
+ """
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
+ device=x.device, dtype=x.dtype
+ )
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ x = x * x_mask
+ for i in range(self.n_layers):
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_0[i](x + y)
+
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_1[i](x + y)
+
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
+
+
+class MultiHeadAttention(nn.Module):
+ def __init__(
+ self,
+ channels,
+ out_channels,
+ n_heads,
+ p_dropout=0.0,
+ window_size=None,
+ heads_share=True,
+ block_length=None,
+ proximal_bias=False,
+ proximal_init=False,
+ ):
+ super().__init__()
+ assert channels % n_heads == 0
+
+ self.channels = channels
+ self.out_channels = out_channels
+ self.n_heads = n_heads
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+ self.heads_share = heads_share
+ self.block_length = block_length
+ self.proximal_bias = proximal_bias
+ self.proximal_init = proximal_init
+ self.attn = None
+
+ self.k_channels = channels // n_heads
+ self.conv_q = nn.Conv1d(channels, channels, 1)
+ self.conv_k = nn.Conv1d(channels, channels, 1)
+ self.conv_v = nn.Conv1d(channels, channels, 1)
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
+ self.drop = nn.Dropout(p_dropout)
+
+ if window_size is not None:
+ n_heads_rel = 1 if heads_share else n_heads
+ rel_stddev = self.k_channels**-0.5
+ self.emb_rel_k = nn.Parameter(
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+ * rel_stddev
+ )
+ self.emb_rel_v = nn.Parameter(
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+ * rel_stddev
+ )
+
+ nn.init.xavier_uniform_(self.conv_q.weight)
+ nn.init.xavier_uniform_(self.conv_k.weight)
+ nn.init.xavier_uniform_(self.conv_v.weight)
+ if proximal_init:
+ with torch.no_grad():
+ self.conv_k.weight.copy_(self.conv_q.weight)
+ self.conv_k.bias.copy_(self.conv_q.bias)
+
+ def forward(self, x, c, attn_mask=None):
+ q = self.conv_q(x)
+ k = self.conv_k(c)
+ v = self.conv_v(c)
+
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+ x = self.conv_o(x)
+ return x
+
+ def attention(self, query, key, value, mask=None):
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
+ b, d, t_s, t_t = (*key.size(), query.size(2))
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+ if self.window_size is not None:
+ assert (
+ t_s == t_t
+ ), "Relative attention is only available for self-attention."
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+ rel_logits = self._matmul_with_relative_keys(
+ query / math.sqrt(self.k_channels), key_relative_embeddings
+ )
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
+ scores = scores + scores_local
+ if self.proximal_bias:
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
+ scores = scores + self._attention_bias_proximal(t_s).to(
+ device=scores.device, dtype=scores.dtype
+ )
+ if mask is not None:
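+            # mask with -1e4 rather than -inf so the masked softmax stays finite under half precision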
+ scores = scores.masked_fill(mask == 0, -1e4)
+ if self.block_length is not None:
+ assert (
+ t_s == t_t
+ ), "Local attention is only available for self-attention."
+ block_mask = (
+ torch.ones_like(scores)
+ .triu(-self.block_length)
+ .tril(self.block_length)
+ )
+ scores = scores.masked_fill(block_mask == 0, -1e4)
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
+ p_attn = self.drop(p_attn)
+ output = torch.matmul(p_attn, value)
+ if self.window_size is not None:
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
+ value_relative_embeddings = self._get_relative_embeddings(
+ self.emb_rel_v, t_s
+ )
+ output = output + self._matmul_with_relative_values(
+ relative_weights, value_relative_embeddings
+ )
+ output = (
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
+ return output, p_attn
+
+ def _matmul_with_relative_values(self, x, y):
+ """
+ x: [b, h, l, m]
+ y: [h or 1, m, d]
+ ret: [b, h, l, d]
+ """
+ ret = torch.matmul(x, y.unsqueeze(0))
+ return ret
+
+ def _matmul_with_relative_keys(self, x, y):
+ """
+ x: [b, h, l, d]
+ y: [h or 1, m, d]
+ ret: [b, h, l, m]
+ """
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+ return ret
+
+ def _get_relative_embeddings(self, relative_embeddings, length):
+ max_relative_position = 2 * self.window_size + 1
+ # Pad first before slice to avoid using cond ops.
+ pad_length = max(length - (self.window_size + 1), 0)
+ slice_start_position = max((self.window_size + 1) - length, 0)
+ slice_end_position = slice_start_position + 2 * length - 1
+ if pad_length > 0:
+ padded_relative_embeddings = F.pad(
+ relative_embeddings,
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+ )
+ else:
+ padded_relative_embeddings = relative_embeddings
+ used_relative_embeddings = padded_relative_embeddings[
+ :, slice_start_position:slice_end_position
+ ]
+ return used_relative_embeddings
+
+ def _relative_position_to_absolute_position(self, x):
+ """
+ x: [b, h, l, 2*l-1]
+ ret: [b, h, l, l]
+ """
+ batch, heads, length, _ = x.size()
+ # Concat columns of pad to shift from relative to absolute indexing.
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+        # Concat extra elements so that it adds up to shape (len+1, 2*len-1).
+ x_flat = x.view([batch, heads, length * 2 * length])
+ x_flat = F.pad(
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+ )
+
+ # Reshape and slice out the padded elements.
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
+ :, :, :length, length - 1 :
+ ]
+ return x_final
+
+ def _absolute_position_to_relative_position(self, x):
+ """
+ x: [b, h, l, l]
+ ret: [b, h, l, 2*l-1]
+ """
+ batch, heads, length, _ = x.size()
+        # pad along column
+ x = F.pad(
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+ )
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+ # add 0's in the beginning that will skew the elements after reshape
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+ return x_final
+
+ def _attention_bias_proximal(self, length):
+ """Bias for self-attention to encourage attention to close positions.
+ Args:
+ length: an integer scalar.
+ Returns:
+ a Tensor with shape [1, 1, length, length]
+ """
+ r = torch.arange(length, dtype=torch.float32)
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=0.0,
+ activation=None,
+ causal=False,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.activation = activation
+ self.causal = causal
+
+ if causal:
+ self.padding = self._causal_padding
+ else:
+ self.padding = self._same_padding
+
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+ self.drop = nn.Dropout(p_dropout)
+
+ def forward(self, x, x_mask):
+ x = self.conv_1(self.padding(x * x_mask))
+ if self.activation == "gelu":
+ x = x * torch.sigmoid(1.702 * x)
+ else:
+ x = torch.relu(x)
+ x = self.drop(x)
+ x = self.conv_2(self.padding(x * x_mask))
+ return x * x_mask
+
+ def _causal_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = self.kernel_size - 1
+ pad_r = 0
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, commons.convert_pad_shape(padding))
+ return x
+
+ def _same_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = (self.kernel_size - 1) // 2
+ pad_r = self.kernel_size // 2
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, commons.convert_pad_shape(padding))
+ return x
diff --git a/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x_onlyspk_double.py b/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x_onlyspk_double.py
new file mode 100644
index 0000000..1268f17
--- /dev/null
+++ b/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x_onlyspk_double.py
@@ -0,0 +1,1301 @@
+import math, pdb, os
+from time import time as ttime
+import torch
+from torch import nn
+from torch.nn import functional as F
+from lib.infer_pack import modules
+from lib.infer_pack import attentions_in_dec_double as attentions
+from lib.infer_pack import commons
+from lib.infer_pack.commons import init_weights, get_padding
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from lib.infer_pack.commons import init_weights
+import numpy as np
+from lib.infer_pack import commons
+from thop import profile
+from diffuse_fang.diffUse_wraper_double import diff_decoder,ddpm_para
+ddpm_dp = ddpm_para()
+g2_dim = 256
+
+class TextEncoder256(nn.Module):
+ def __init__(
+ self,
+ out_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ f0=True,
+ ):
+ super().__init__()
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.emb_phone = nn.Linear(256, hidden_channels)
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+ if f0 == True:
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
+ self.encoder = attentions.Encoder(
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+ )
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, phone, pitch, lengths):
+ if pitch == None:
+ x = self.emb_phone(phone)
+ else:
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
+ x = self.lrelu(x)
+ x = torch.transpose(x, 1, -1) # [b, h, t]
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+ x.dtype
+ )
+ x = self.encoder(x * x_mask, x_mask)
+ stats = self.proj(x) * x_mask
+
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ return m, logs, x_mask
+
+
+class TextEncoder768(nn.Module):
+ def __init__(
+ self,
+ out_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ f0=True,
+ ):
+ super().__init__()
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.emb_phone = nn.Linear(768, hidden_channels)
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+ if f0 == True:
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
+ self.encoder = attentions.Encoder(
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+ )
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+ #self.emb_g = nn.Linear(256, hidden_channels)
+
+ def forward(self, phone, pitch, lengths,g):#fang add
+ if pitch == None:
+ x = self.emb_phone(phone)
+ else:
+ x = self.emb_phone(phone) + self.emb_pitch(pitch) #+ self.emb_g(g)
+ #print("@@@x:",x.shape)
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
+ x = self.lrelu(x)
+ x = torch.transpose(x, 1, -1) # [b, h, t]
+ #print("@@@x1:",x.shape)
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+ x.dtype
+ )
+ #x = self.encoder(x * x_mask, x_mask,g)
+ x = self.encoder(x * x_mask, x_mask,g)#fang add
+ stats = self.proj(x) * x_mask
+
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ return m, logs, x_mask,x
+
+
+class ResidualCouplingBlock(nn.Module):
+ def __init__(
+ self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ n_flows=4,
+ gin_channels=0,
+ ):
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.flows = nn.ModuleList()
+ for i in range(n_flows):
+ self.flows.append(
+ modules.ResidualCouplingLayer(
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=gin_channels,
+ mean_only=True,
+ )
+ )
+ self.flows.append(modules.Flip())
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ else:
+ for flow in reversed(self.flows):
+ x = flow(x, x_mask, g=g, reverse=reverse)
+ return x
+
+ def remove_weight_norm(self):
+ for i in range(self.n_flows):
+ self.flows[i * 2].remove_weight_norm()
+
+
+class PosteriorEncoder(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = modules.WN(
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=gin_channels,
+ )
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths, g=None):
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
+ x.dtype
+ )
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+ stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)  # mean and variance (fang)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask  ## random sampling (fang)
+ return z, m, logs, x_mask
+
+ def remove_weight_norm(self):
+ self.enc.remove_weight_norm()
+
+
+class Generator(torch.nn.Module):
+ def __init__(
+ self,
+ initial_channel,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=0,
+ ):
+ super(Generator, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.conv_pre = Conv1d(
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
+ )
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(
+ weight_norm(
+ ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ k,
+ u,
+ padding=(k - u) // 2,
+ )
+ )
+ )
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel // (2 ** (i + 1))
+ for j, (k, d) in enumerate(
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
+ ):
+ self.resblocks.append(resblock(ch, k, d))
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ def forward(self, x, g=None):
+ x = self.conv_pre(x)
+ if g is not None:
+ x = x + self.cond(g)
+
+ for i in range(self.num_upsamples):
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ x = self.ups[i](x)
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i * self.num_kernels + j](x)
+ else:
+ xs += self.resblocks[i * self.num_kernels + j](x)
+ x = xs / self.num_kernels
+ x = F.leaky_relu(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+
+
+class SineGen(torch.nn.Module):
+ """Definition of sine generator
+ SineGen(samp_rate, harmonic_num = 0,
+ sine_amp = 0.1, noise_std = 0.003,
+ voiced_threshold = 0,
+ flag_for_pulse=False)
+ samp_rate: sampling rate in Hz
+ harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine-waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+ Note: when flag_for_pulse is True, the first time step of a voiced
+ segment is always sin(np.pi) or cos(0)
+ """
+
+ def __init__(
+ self,
+ samp_rate,
+ harmonic_num=0,
+ sine_amp=0.1,
+ noise_std=0.003,
+ voiced_threshold=0,
+ flag_for_pulse=False,
+ ):
+ super(SineGen, self).__init__()
+ self.sine_amp = sine_amp
+ self.noise_std = noise_std
+ self.harmonic_num = harmonic_num
+ self.dim = self.harmonic_num + 1
+ self.sampling_rate = samp_rate
+ self.voiced_threshold = voiced_threshold
+
+ def _f02uv(self, f0):
+ # generate uv signal
+ uv = torch.ones_like(f0)
+ uv = uv * (f0 > self.voiced_threshold)
+ return uv
+
+ def forward(self, f0, upp):
+ """sine_tensor, uv = forward(f0)
+ input F0: tensor(batchsize=1, length, dim=1)
+ f0 for unvoiced steps should be 0
+ output sine_tensor: tensor(batchsize=1, length, dim)
+ output uv: tensor(batchsize=1, length, 1)
+ """
+ with torch.no_grad():
+ f0 = f0[:, None].transpose(1, 2)
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+ # fundamental component
+ f0_buf[:, :, 0] = f0[:, :, 0]
+ for idx in np.arange(self.harmonic_num):
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+ idx + 2
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+            rad_values = (f0_buf / self.sampling_rate) % 1  ### the % 1 means the n_har products cannot be optimized in post-processing
+ rand_ini = torch.rand(
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+ )
+ rand_ini[:, 0] = 0
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  ##### a % 1 here would prevent the following cumsum from being optimized
+ tmp_over_one *= upp
+ tmp_over_one = F.interpolate(
+ tmp_over_one.transpose(2, 1),
+ scale_factor=upp,
+ mode="linear",
+ align_corners=True,
+ ).transpose(2, 1)
+ rad_values = F.interpolate(
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+ ).transpose(
+ 2, 1
+ ) #######
+ tmp_over_one %= 1
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+ cumsum_shift = torch.zeros_like(rad_values)
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+ sine_waves = torch.sin(
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+ )
+ sine_waves = sine_waves * self.sine_amp
+ uv = self._f02uv(f0)
+ uv = F.interpolate(
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+ ).transpose(2, 1)
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+ noise = noise_amp * torch.randn_like(sine_waves)
+ sine_waves = sine_waves * uv + noise
+ return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+ """SourceModule for hn-nsf
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+ add_noise_std=0.003, voiced_threshod=0)
+ sampling_rate: sampling_rate in Hz
+ harmonic_num: number of harmonic above F0 (default: 0)
+ sine_amp: amplitude of sine source signal (default: 0.1)
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
+ note that amplitude of noise in unvoiced is decided
+ by sine_amp
+    voiced_threshold: threshold to set U/V given F0 (default: 0)
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+ F0_sampled (batchsize, length, 1)
+ Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length, 1)
+ uv (batchsize, length, 1)
+ """
+
+ def __init__(
+ self,
+ sampling_rate,
+ harmonic_num=0,
+ sine_amp=0.1,
+ add_noise_std=0.003,
+ voiced_threshod=0,
+ is_half=True,
+ ):
+ super(SourceModuleHnNSF, self).__init__()
+
+ self.sine_amp = sine_amp
+ self.noise_std = add_noise_std
+ self.is_half = is_half
+ # to produce sine waveforms
+ self.l_sin_gen = SineGen(
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
+ )
+
+ # to merge source harmonics into a single excitation
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+ self.l_tanh = torch.nn.Tanh()
+
+ def forward(self, x, upp=None):
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
+ if self.is_half:
+ sine_wavs = sine_wavs.half()
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+ return sine_merge, None, None # noise, uv
+
+
+class GeneratorNSF(torch.nn.Module):
+ def __init__(
+ self,
+ initial_channel,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels,
+ sr,
+ is_half=False,
+ ):
+ super(GeneratorNSF, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
+ self.m_source = SourceModuleHnNSF(
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
+ )
+ self.noise_convs = nn.ModuleList()
+ self.conv_pre = Conv1d(
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
+ )
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+ self.ups = nn.ModuleList()
+ self.ups_g = nn.ModuleList()# fang add
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
+ self.ups.append(
+ weight_norm(
+ ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ k,
+ u,
+ padding=(k - u) // 2,
+ )
+ )
+ )
+ self.ups_g.append(
+ nn.Conv1d(upsample_initial_channel,upsample_initial_channel // (2 ** (i + 1) ), 1)
+ #F.interpolate(input, scale_factor=2, mode='nearest')
+ )# fang add
+ if i + 1 < len(upsample_rates):
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
+ self.noise_convs.append(
+ Conv1d(
+ 1,
+ c_cur,
+ kernel_size=stride_f0 * 2,
+ stride=stride_f0,
+ padding=stride_f0 // 2,
+ )
+ )
+ else:
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel // (2 ** (i + 1))
+ for j, (k, d) in enumerate(
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
+ ):
+ self.resblocks.append(resblock(ch, k, d))
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ self.upp = np.prod(upsample_rates)
+
+ def forward(self, x, f0, g=None):
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
+ har_source = har_source.transpose(1, 2)
+ x = self.conv_pre(x)
+ if g is not None:
+ #x = x + self.cond(g) ##org
+ tmp_g = self.cond(g) ##fang add
+ x = x + tmp_g ##fang add
+ #print('###@@@@##x:',x.shape )
+ for i in range(self.num_upsamples):
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ x = self.ups[i](x)
+ x_source = self.noise_convs[i](har_source)
+ x = x + x_source
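+            # re-inject the projected global conditioning tmp_g at every upsampling scale (fang's change), not only at conv_pre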
+ xg = self.ups_g[i](tmp_g) #fang add
+ x = x + xg #fang add
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i * self.num_kernels + j](x)
+ else:
+ xs += self.resblocks[i * self.num_kernels + j](x)
+ x = xs / self.num_kernels
+ #print('@@@@##x:',x.shape)
+ x = F.leaky_relu(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+
+
+sr2sr = {
+ "32k": 32000,
+ "40k": 40000,
+ "48k": 48000,
+ "24k": 24000,
+}
+
+
+class SynthesizerTrnMs256NSFsid(nn.Module):
+ def __init__(
+ self,
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ spk_embed_dim,
+ gin_channels,
+ sr,
+ **kwargs
+ ):
+ super().__init__()
+ if type(sr) == type("strr"):
+ sr = sr2sr[sr]
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.gin_channels = gin_channels
+ # self.hop_length = hop_length#
+ self.spk_embed_dim = spk_embed_dim
+ self.enc_p = TextEncoder256(
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ )
+ self.dec = GeneratorNSF(
+ inter_channels,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=gin_channels,
+ sr=sr,
+ is_half=kwargs["is_half"],
+ )
+ self.enc_q = PosteriorEncoder(
+ spec_channels,
+ inter_channels,
+ hidden_channels,
+ 5,
+ 1,
+ 16,
+ gin_channels=gin_channels,
+ )
+ self.flow = ResidualCouplingBlock(
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+ )
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+ def remove_weight_norm(self):
+ self.dec.remove_weight_norm()
+ self.flow.remove_weight_norm()
+ self.enc_q.remove_weight_norm()
+
+ def forward(
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # here ds is the id, [bs, 1]
+        # print(1,pitch.shape)#[bs,t]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]  ## the trailing 1 is t, broadcast
+ #print("@@@pitch.shape: ",pitch.shape)
+ #g = ds.unsqueeze(-1)
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+ z_p = self.flow(z, y_mask, g=g)
+ z_slice, ids_slice = commons.rand_slice_segments(
+ z, y_lengths, self.segment_size
+        )  # randomly slice z into fixed-length segments of self.segment_size; the varying start positions are stored in ids_slice, and z_slice holds the slices (fang)
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
+ # print(-2,pitchf.shape,z_slice.shape)
+ o = self.dec(z_slice, pitchf, g=g)
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
+ g = self.emb_g(sid).unsqueeze(-1)
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+ if rate:
+ head = int(z_p.shape[2] * rate)
+ z_p = z_p[:, :, -head:]
+ x_mask = x_mask[:, :, -head:]
+ nsff0 = nsff0[:, -head:]
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
+ print('z shape: ',z.shape)
+ print('x_mask shape: ',x_mask.shape)
+ z_x_mask = z * x_mask
+ print('z_x_mask shape: ',z_x_mask.shape)
+ print('nsff0 shape:p', nsff0.shape)
+ print('g shape: ',g.shape)
+ o = self.dec(z * x_mask, nsff0, g=g)
+
+ self.get_floats()
+ return o, x_mask, (z, z_p, m_p, logs_p)
+
+ def get_floats(self,):
+ T = 21.4 #郭宇_但愿人长久_40k.wav
+        z = torch.randn(1, g2_dim, 2740)  # 2s of data (also validated with 2s of data; integer multiples are correct and avoid interference)
+ x_mask = torch.randn(1,1 ,2740)
+ g = torch.randn(1,256 ,1)
+
+ inputs_bfcc = z #z * x_mask
+ nsff0 = torch.randn(1, 2740)
+ devices = 'cuda' #'cpu'
+ self.dec = self.dec.to(devices).half()
+ inputs_bfcc , nsff0, g = inputs_bfcc.to(devices).half(), nsff0.to(devices).half(), g.to(devices).half()
+ flops, params = profile(self.dec, (inputs_bfcc, nsff0, g))
+ print(f'@@@hifi-gan nsf decflops: {flops/(T*pow(10,9))} GFLOPS, params: { params/pow(10,6)} M')
+ return 0
+
+class SynthesizerTrnMs768NSFsid(nn.Module):
+ def __init__(
+ self,
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ spk_embed_dim,
+ gin_channels,
+ sr,
+ **kwargs
+ ):
+ super().__init__()
+ if type(sr) == type("strr"):
+ sr = sr2sr[sr]
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.gin_channels = gin_channels
+ # self.hop_length = hop_length#
+ self.spk_embed_dim = spk_embed_dim
+ self.enc_p = TextEncoder768(
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ )
+ self.dec = GeneratorNSF(
+ inter_channels,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=gin_channels,
+ sr=sr,
+ is_half=kwargs["is_half"],
+ )
+ self.enc_q = PosteriorEncoder(
+ spec_channels,
+ inter_channels,
+ hidden_channels,
+ 5,
+ 1,
+ 16,
+ gin_channels=gin_channels,
+ )
+ self.flow = ResidualCouplingBlock(
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+ )
+ #for p in self.flow.parameters():
+ # p.requires_grad=False
+ #for p in self.enc_p.parameters():
+ # p.requires_grad=False
+
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+ self.diff_decoder = diff_decoder
+ #self.diff_cond_g = nn.Conv1d(256,g2_dim, 1)
+ self.diff_cond_gx = self.zero_module(self.conv_nd(1, 256, g2_dim, 3, padding=1))
+ self.diff_cond_out = self.zero_module(self.conv_nd(1, g2_dim, g2_dim, 3, padding=1))
+ self.lzp = 0.1
+ self.ssl_proj = self.zero_module(nn.Conv1d(256*2, 256, 1, stride=1))
+ self.ssl_proj1 = self.zero_module(nn.Conv1d(256, 256, 1, stride=1))
+ self.ssl_proj1_norm = nn.BatchNorm1d(256)#nn.LayerNorm(256)
+ self.ssl_proj2 = self.zero_module(nn.Conv1d(256, 256, 1, stride=1))
+ self.ssl_proj2_norm = nn.BatchNorm1d(256)#nn.LayerNorm(256)
+
+ def zero_module(self,module):
+ """
+ Zero out the parameters of a module and return it.
+ """
+ for p in module.parameters():
+ p.detach().zero_()
+ return module
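+    # zero_module is applied in __init__ to diff_cond_gx, diff_cond_out and the ssl_proj layers, so the added branches start as no-ops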
+
+ def conv_nd(self, dims, *args, **kwargs):
+ """
+ Create a 1D, 2D, or 3D convolution module.
+ """
+ if dims == 1:
+ return nn.Conv1d(*args, **kwargs)
+ elif dims == 2:
+ return nn.Conv2d(*args, **kwargs)
+ elif dims == 3:
+ return nn.Conv3d(*args, **kwargs)
+ raise ValueError(f"unsupported dimensions: {dims}")
+
+ def remove_weight_norm(self):
+ self.dec.remove_weight_norm()
+ self.flow.remove_weight_norm()
+ self.enc_q.remove_weight_norm()
+
+ def forward(
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # here ds is the id, [bs, 1]
+        # print(1,pitch.shape)#[bs,t]
+        #g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]  ## the trailing 1 is t, broadcast
+ #print("@@@@@fang@@@@@")
+
+ #print("@@@@g:",ds.size())
+ #g, ge = ds[0], ds[1]
+ g = ds.unsqueeze(-1)
+ #g = self.ssl_proj(g)#[:,256:,:])
+ g1 = self.ssl_proj1_norm( self.ssl_proj1(g[:,:256,:]))
+ g2 = self.ssl_proj2_norm( self.ssl_proj2(g[:,256:,:]))
+ g = g1 + g[:,256:,:]#+ g2
+ #g = g[:,:256,:] + ge
+ #print("@@@@g1:",g.size())
+ #print("g:",g.size())
+ #print("phone_lengths: ",phone_lengths.size())
+ #print("pitch: ",pitch.size())
+ #m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+ m_p, logs_p, x_mask, x_embed = self.enc_p(phone, pitch, phone_lengths,g)#fang add
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)  # self.enc_q = PosteriorEncoder  ## predicts the randomly sampled latent z; m_q is the mean, logs_q the variance, y_mask the mask (fang)
+
+        z_p = self.flow(z, y_mask, g=g)  # z is the y_mask-ed input
+ z_p_sample = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * y_mask
+ zx = self.flow(z_p_sample, y_mask, g=g, reverse=True)
+ #print("@@@@@g:",g.shape)
+ g_z_p = self.diff_cond_gx(g)
+ #print("@@@@@g_z_p:",g_z_p.shape)
+ z_res = z - zx
+
+ #print('#######x_embed:',x_embed.shape)
+ #print('#######z_p_sample:',z_p_sample.shape)
+ #print('#######g_z_p:',g_z_p.shape)
+ #z_p1 = z_p_sample + g_z_p
+ z_p1 = x_embed + g_z_p
+ ###diff st
+ z_p_diff = z_p1.transpose(1,2) ##b,frames,feat
+ z_diff = z_res.transpose(1,2) ##b,frames,feat
+
+ diff_loss,_ = self.diff_decoder(z_p_diff, gt_spec=z_diff, infer=False, infer_speedup=ddpm_dp.infer_speedup, method=ddpm_dp.method, use_tqdm=ddpm_dp.use_tqdm)
+
+ #self.diff_decoder = self.diff_decoder.float()
+ #print("@@@z: ",z.shape)
+ #b = z_p_diff.shape[0]
+ t = 200#np.random.randint(100,1000)#200#torch.randint(0, 1000, (b,), device=g.device).long()
+ z_diff = zx.transpose(1,2)
+ z_x_diff = self.diff_decoder(z_p_diff, gt_spec=z_diff*self.lzp, infer=True, infer_speedup=ddpm_dp.infer_speedup, method=ddpm_dp.method, k_step=t, use_tqdm=False)
+ #print("@@@z_x: ",z_x.shape)
+ z1 = z_x_diff.transpose(1,2)
+ z1 = self.diff_cond_out(z1)
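+        # z_in: flow-inverted sample plus the diffusion correction; diff_cond_out is zero-initialized, so z1 starts out as zero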
+ z_in = (zx + z1)
+ #z_p = z_p_rec.transpose(1,2)
+ ##diff en
+ ##oneflow
+ #z_p = self.flow(z, y_mask, g=g)
+
+ z_slice, ids_slice = commons.rand_slice_segments(
+ z_in, y_lengths, self.segment_size
+ )
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
+ # print(-2,pitchf.shape,z_slice.shape)
+ o = self.dec(z_slice, pitchf, g=g)
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q),diff_loss
+
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
+ #g = self.emb_g(sid).unsqueeze(-1)
+ g = sid.unsqueeze(-1).unsqueeze(0)
+ g = self.ssl_proj(g)
+ #g1 = self.ssl_proj1_norm(g[:,:256,:])
+ #g2 = self.ssl_proj2_norm(g[:,256:,:])
+ #g1 = self.ssl_proj1_norm( self.ssl_proj1(g[:,:256,:]))
+ #g2 = self.ssl_proj2_norm( self.ssl_proj2(g[:,256:,:]))
+ #g1 = self.ssl_proj1(g[:,:256,:])
+ #g2 = self.ssl_proj1(g[:,:256,:])
+ #g = g1 + g2
+ #g = g[:,256:,:]#+ g2
+ #m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) #org
+ print("@@@@@@pitch:",pitch.shape,"phone:",phone.shape)
+ m_p, logs_p, x_mask, x_embed = self.enc_p(phone, pitch, phone_lengths,g) #fang add
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+ if rate:
+ head = int(z_p.shape[2] * rate)
+ z_p = z_p[:, :, -head:]
+ x_mask = x_mask[:, :, -head:]
+ nsff0 = nsff0[:, -head:]
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
+
+ g_z_p = self.diff_cond_gx(g)
+ #z_p1 = z_p + g_z_p
+ z_p1 = x_embed + g_z_p
+ #if is_half:
+ #self.diff_decoder = self.diff_decoder.float()
+ z_p_diff = z_p1.transpose(1,2).float() ##b,frames,feat
+ z_diff = z.transpose(1,2) ##b,frames,feat
+ print("@@z_p_diff", z_p_diff[0,0,:])
+ self.diff_decoder = self.diff_decoder.float()
+ z_x = self.diff_decoder(z_p_diff, gt_spec=z_diff*self.lzp, infer=True, infer_speedup=ddpm_dp.infer_speedup, method=ddpm_dp.method, k_step=200, use_tqdm=ddpm_dp.use_tqdm)
+ print("@@z_x", z_x[0,0,:])
+ z1 = z_x.transpose(1,2).half()
+ z_res = self.diff_cond_out(z1)
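+        # add the diffusion-predicted residual back onto the flow output before the NSF decoder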
+ z = z + z_res
+ o = self.dec(z * x_mask, nsff0, g=g)
+ #self.get_floats()
+ return o, x_mask, (z, z_p, m_p, logs_p)
+
+
+class SynthesizerTrnMs256NSFsid_nono(nn.Module):
+ def __init__(
+ self,
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ spk_embed_dim,
+ gin_channels,
+ sr=None,
+ **kwargs
+ ):
+ super().__init__()
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.gin_channels = gin_channels
+ # self.hop_length = hop_length#
+ self.spk_embed_dim = spk_embed_dim
+ self.enc_p = TextEncoder256(
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ f0=False,
+ )
+ self.dec = Generator(
+ inter_channels,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=gin_channels,
+ )
+ self.enc_q = PosteriorEncoder(
+ spec_channels,
+ inter_channels,
+ hidden_channels,
+ 5,
+ 1,
+ 16,
+ gin_channels=gin_channels,
+ )
+ self.flow = ResidualCouplingBlock(
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+ )
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+ def remove_weight_norm(self):
+ self.dec.remove_weight_norm()
+ self.flow.remove_weight_norm()
+ self.enc_q.remove_weight_norm()
+
+    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # here ds is the id, [bs, 1]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]  ## the trailing 1 is t, broadcast
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+ z_p = self.flow(z, y_mask, g=g)
+ z_slice, ids_slice = commons.rand_slice_segments(
+ z, y_lengths, self.segment_size
+ )
+ o = self.dec(z_slice, g=g)
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+ def infer(self, phone, phone_lengths, sid, rate=None):
+ g = self.emb_g(sid).unsqueeze(-1)
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+ if rate:
+ head = int(z_p.shape[2] * rate)
+ z_p = z_p[:, :, -head:]
+ x_mask = x_mask[:, :, -head:]
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
+ o = self.dec(z * x_mask, g=g)
+ return o, x_mask, (z, z_p, m_p, logs_p)
+
+
+class SynthesizerTrnMs768NSFsid_nono(nn.Module):
+ def __init__(
+ self,
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ spk_embed_dim,
+ gin_channels,
+ sr=None,
+ **kwargs
+ ):
+ super().__init__()
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.gin_channels = gin_channels
+ # self.hop_length = hop_length#
+ self.spk_embed_dim = spk_embed_dim
+ self.enc_p = TextEncoder768(
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ f0=False,
+ )
+ self.dec = Generator(
+ inter_channels,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=gin_channels,
+ )
+ self.enc_q = PosteriorEncoder(
+ spec_channels,
+ inter_channels,
+ hidden_channels,
+ 5,
+ 1,
+ 16,
+ gin_channels=gin_channels,
+ )
+ self.flow = ResidualCouplingBlock(
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+ )
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+ def remove_weight_norm(self):
+ self.dec.remove_weight_norm()
+ self.flow.remove_weight_norm()
+ self.enc_q.remove_weight_norm()
+
+    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # here ds is the id, [bs, 1]
+        #g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]  ## the trailing 1 is t, broadcast
+ g = ds.unsqueeze(-1)
+ #m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) #org
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths,g=g)#fang add
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+ z_p = self.flow(z, y_mask, g=g)
+ z_slice, ids_slice = commons.rand_slice_segments(
+ z, y_lengths, self.segment_size
+ )
+ o = self.dec(z_slice, g=g)
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+ def infer(self, phone, phone_lengths, sid, rate=None):
+ #g = self.emb_g(sid).unsqueeze(-1)
+ g = sid.unsqueeze(-1).unsqueeze(0)
+ #m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths,g=g)#fang add
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+ if rate:
+ head = int(z_p.shape[2] * rate)
+ z_p = z_p[:, :, -head:]
+ x_mask = x_mask[:, :, -head:]
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
+ o = self.dec(z * x_mask, g=g)
+ return o, x_mask, (z, z_p, m_p, logs_p)
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(MultiPeriodDiscriminator, self).__init__()
+ periods = [2, 3, 5, 7, 11, 17]
+ # periods = [3, 5, 7, 11, 17, 23, 37]
+
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+ discs = discs + [
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+ ]
+ self.discriminators = nn.ModuleList(discs)
+
+ def forward(self, y, y_hat):
+ y_d_rs = [] #
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ # for j in range(len(fmap_r)):
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
+ y_d_rs.append(y_d_r)
+ y_d_gs.append(y_d_g)
+ fmap_rs.append(fmap_r)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class MultiPeriodDiscriminatorV2(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(MultiPeriodDiscriminatorV2, self).__init__()
+ # periods = [2, 3, 5, 7, 11, 17]
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
+
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+ discs = discs + [
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+ ]
+ self.discriminators = nn.ModuleList(discs)
+
+ def forward(self, y, y_hat):
+ y_d_rs = [] #
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ # for j in range(len(fmap_r)):
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
+ y_d_rs.append(y_d_r)
+ y_d_gs.append(y_d_g)
+ fmap_rs.append(fmap_r)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class DiscriminatorS(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(DiscriminatorS, self).__init__()
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ self.convs = nn.ModuleList(
+ [
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+ ]
+ )
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+ def forward(self, x):
+ fmap = []
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class DiscriminatorP(torch.nn.Module):
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+ super(DiscriminatorP, self).__init__()
+ self.period = period
+ self.use_spectral_norm = use_spectral_norm
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ self.convs = nn.ModuleList(
+ [
+ norm_f(
+ Conv2d(
+ 1,
+ 32,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ norm_f(
+ Conv2d(
+ 32,
+ 128,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ norm_f(
+ Conv2d(
+ 128,
+ 512,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ norm_f(
+ Conv2d(
+ 512,
+ 1024,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ norm_f(
+ Conv2d(
+ 1024,
+ 1024,
+ (kernel_size, 1),
+ 1,
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ ),
+ ]
+ )
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+ def forward(self, x):
+ fmap = []
+
+ # 1d to 2d
+ b, c, t = x.shape
+ if t % self.period != 0: # pad first
+ n_pad = self.period - (t % self.period)
+ x = F.pad(x, (0, n_pad), "reflect")
+ t = t + n_pad
+ x = x.view(b, c, t // self.period, self.period)
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
diff --git a/AIMeiSheng/meisheng_env_preparex.py b/AIMeiSheng/meisheng_env_preparex.py
index 62f0afb..079ba38 100644
--- a/AIMeiSheng/meisheng_env_preparex.py
+++ b/AIMeiSheng/meisheng_env_preparex.py
@@ -1,55 +1,56 @@
import os
from AIMeiSheng.docker_demo.common import (gs_svc_model_path, gs_hubert_model_path, gs_embed_model_path,gs_embed_model_spk_path, gs_embed_config_spk_path, gs_rmvpe_model_path, download2disk)
def meisheng_env_prepare(logging, AIMeiSheng_Path='./'):
cos_path = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/"
rmvpe_model_url = cos_path + "rmvpe.pt"
if not os.path.exists(gs_rmvpe_model_path):
if not download2disk(rmvpe_model_url, gs_rmvpe_model_path):
logging.fatal(f"download rmvpe_model err={rmvpe_model_url}")
gs_hubert_model_url = cos_path + "hubert_base.pt"
if not os.path.exists(gs_hubert_model_path):
if not download2disk(gs_hubert_model_url, gs_hubert_model_path):
logging.fatal(f"download hubert_model err={gs_hubert_model_url}")
#model_svc = "xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth"
#model_svc = "xusong_v2_org_version_alldata_embed1_enzx_diff_ocean_ctl_enc_e22_s363704.pth"
- model_svc = "xusong_v2_org_version_alldata_embed_spkenx200x_vocal_e22_s95040.pth"
+ #model_svc = "xusong_v2_org_version_alldata_embed_spkenx200x_vocal_e22_s95040.pth"
+ model_svc = "xusong_v2_org_version_alldata_embed_spkenx200x_double_e14_s90706.pth"
base_dir = os.path.dirname(gs_svc_model_path)
os.makedirs(base_dir, exist_ok=True)
svc_model_url = cos_path + model_svc
if not os.path.exists(gs_svc_model_path):
if not download2disk(svc_model_url, gs_svc_model_path):
logging.fatal(f"download svc_model err={svc_model_url}")
model_embed = "model.pt"
base_dir = os.path.dirname(gs_embed_model_path)
os.makedirs(base_dir, exist_ok=True)
embed_model_url = cos_path + model_embed
if not os.path.exists(gs_embed_model_path):
if not download2disk(embed_model_url, gs_embed_model_path):
logging.fatal(f"download embed_model err={embed_model_url}")
model_spk_embed = "best_model.pth.tar"
base_dir = os.path.dirname(gs_embed_model_spk_path)
os.makedirs(base_dir, exist_ok=True)
embed_model_url = cos_path + model_spk_embed
if not os.path.exists(gs_embed_model_spk_path):
if not download2disk(embed_model_url, gs_embed_model_spk_path):
logging.fatal(f"download embed_model err={embed_model_url}")
model_spk_embed_cfg = "config.json"
base_dir = os.path.dirname(gs_embed_config_spk_path)
os.makedirs(base_dir, exist_ok=True)
embed_model_url = cos_path + model_spk_embed_cfg
if not os.path.exists(gs_embed_config_spk_path):
if not download2disk(embed_model_url, gs_embed_config_spk_path):
logging.fatal(f"download embed_model err={embed_model_url}")
if __name__ == "__main__":
    import logging
    meisheng_env_prepare(logging)
diff --git a/AIMeiSheng/meisheng_svc_final.py b/AIMeiSheng/meisheng_svc_final.py
index 2080cba..273038c 100644
--- a/AIMeiSheng/meisheng_svc_final.py
+++ b/AIMeiSheng/meisheng_svc_final.py
@@ -1,247 +1,245 @@
import os
import sys
sys.path.append(os.path.dirname(__file__))
import time
import shutil
import glob
import hashlib
import librosa
import soundfile
import gradio as gr
import pandas as pd
import numpy as np
from AIMeiSheng.RawNet3.infererence_fang_meisheng import get_embed, get_embed_model
-#from myinfer_multi_spk_embed_in_dec_diff_fi_meisheng import svc_main, load_hubert, get_vc, get_rmvpe
-from AIMeiSheng.myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x import svc_main,load_hubert, get_vc,get_rmvpe
-
+from myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x_onlyspk_double import svc_main,load_hubert, get_vc,get_rmvpe
from gender_classify import load_gender_model
from AIMeiSheng.docker_demo.common import gs_svc_model_path, gs_embed_model_path, gs_rmvpe_model_path, gs_err_code_target_silence
from slicex.slice_set_silence import del_noise
gs_simple_mixer_path = "/data/gpu_env_common/bin/simple_mixer"  ## mixing executable
tmp_workspace_name = "batch_test_ocean_fi"  # workspace name
song_folder = "./data_meisheng/"  ## song folder
gs_work_dir = f"./data_meisheng/{tmp_workspace_name}"  # workspace path
pth_model_path = "./weights/xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth"  ## model file
cur_dir = os.path.abspath(os.path.dirname(__file__))
abs_path = os.path.join(cur_dir, song_folder, tmp_workspace_name) + '/'
f0_method = None
def mix(in_path, acc_path, dst_path):
    # transcode the SVC output to 44.1 kHz stereo ("442" = 44100 Hz, 2 channels)
svc_442_file = in_path + "_442.wav"
st = time.time()
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(in_path, svc_442_file)
os.system(cmd)
if not os.path.exists(svc_442_file):
return -1
print("transcode,{},sp={}".format(in_path, time.time() - st))
    # mix vocal with accompaniment
st = time.time()
cmd = "{} {} {} {} 1".format(gs_simple_mixer_path, svc_442_file, acc_path, dst_path)
os.system(cmd)
print("mixer,{},sp={}".format(in_path, time.time() - st))
def load_model():
global f0_method
embed_model = get_embed_model(gs_embed_model_path)
hubert_model = load_hubert()
get_vc(gs_svc_model_path)
f0_method = get_rmvpe(gs_rmvpe_model_path)
print("model preload finish!!!")
return embed_model, hubert_model # ,svc_model
def meisheng_init():
embed_model, hubert_model = load_model() ##提前加载模型
gender_model = load_gender_model()
return embed_model, hubert_model, gender_model
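# pyin_process_single_rmvpe estimates the singer's mean F0 with RMVPE. Clips
# longer than 15 s are sampled at the head and tail only; the two segment means
# are then reconciled: keep the non-zero one if a segment is silent, the lower
# one if they disagree by more than 55 Hz, otherwise their average.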
def pyin_process_single_rmvpe(input_file):
global f0_method
if f0_method is None:
f0_method = get_rmvpe()
rate = 16000 # 44100
    # load the audio file
y, sr = librosa.load(input_file, sr=rate)
len_s = len(y) / sr
lim_s = 15 # 10
f0_limit_10ms = 10
if (len_s > lim_s):
y1 = y[:sr * lim_s]
y2 = y[-sr * lim_s:]
f0 = f0_method.infer_from_audio(y1, thred=0.03)
f0 = f0[f0 < 600]
valid_f0 = f0[f0 > 50]
if len(valid_f0) > f0_limit_10ms:
mean_pitch1 = np.mean(valid_f0)
else:
mean_pitch1 = 0
f0 = f0_method.infer_from_audio(y2, thred=0.03)
f0 = f0[f0 < 600]
valid_f0 = f0[f0 > 50]
if len(valid_f0) > f0_limit_10ms:
mean_pitch2 = np.mean(valid_f0)
else:
mean_pitch2 = 0
if mean_pitch2 == 0 and mean_pitch1 == 0:
mean_pitch_cur = 0
elif mean_pitch2 == 0 or mean_pitch1 == 0:
mean_pitch_cur = max(mean_pitch1, mean_pitch2)
elif abs(mean_pitch1 - mean_pitch2) > 55:
mean_pitch_cur = min(mean_pitch1, mean_pitch2)
else:
mean_pitch_cur = (mean_pitch1 + mean_pitch2) / 2
else:
f0 = f0_method.infer_from_audio(y, thred=0.03)
f0 = f0[f0 < 600]
valid_f0 = f0[f0 > 50]
if len(valid_f0) > f0_limit_10ms:
mean_pitch_cur = np.mean(valid_f0)
else:
mean_pitch_cur = 0
return mean_pitch_cur
def meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, cs_sim, paras):
    ## compute pitch
    f0up_key = pyin_process_single_rmvpe(target_wav)
    if f0up_key < 40 or np.isnan(f0up_key):  # unvoiced
        return gs_err_code_target_silence
    ## get the timbre embedding
    get_embed(target_wav, embed_npy, embed_md)
    embed_npy_spk = embed_npy[:-4] + '_spk.npy'
    cs_sim.get_spk_embed(target_wav, embed_npy_spk)
    print(f"get embed_npy_spk: {embed_npy_spk}")
print("svc main start...")
svc_main(song_wav, svc_out_path, embed_npy, f0up_key, hubert_md, paras)
print("svc main finished!!")
    del_noise(song_wav, svc_out_path, paras)
    print("removed noise in silent segments")
return 0
def process_svc_online(song_wav, target_wav, svc_out_path, embed_md, hubert_md, cs_sim, paras):
    embed_npy = target_wav[:-4] + '.npy'  ## where the embedding npy is stored
err_code = meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, cs_sim, paras)
return err_code
def process_svc(song_wav, target_wav, svc_out_path, embed_md, hubert_md, cs_sim, paras):
    song_wav1, target_wav, svc_out_path = os.path.basename(song_wav), os.path.basename(
        target_wav), os.path.basename(svc_out_path)  # strip to basenames
    song_wav, target_wav, svc_out_path = song_wav, abs_path + target_wav, abs_path + svc_out_path  # rebuild absolute paths
    embed_npy = target_wav[:-4] + '.npy'  ## where the embedding npy is stored
# similar = meisheng_svc(song_wav,target_wav,svc_out_path,embed_npy,paras)
similar = meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, cs_sim, paras)
return similar
def get_svc(target_yinse_wav, song_name, embed_model, hubert_model, paras):
    '''
    :param target_yinse_wav: target timbre wav
    :param song_name: song name
    :param paras: other parameters
    :return: path of the svc output
    '''
    ## clear the workspace temp dir
if os.path.exists(gs_work_dir):
# shutil.rmtree(gs_work_dir)
cmd = f"rm -rf {gs_work_dir}/*"
os.system(cmd)
else:
os.makedirs(gs_work_dir)
    gender = paras['gender']  ## used to select the song variant
    ## load the target timbre
    f_dst = os.path.join(gs_work_dir, os.path.basename(target_yinse_wav))
    # print("dir :", f_dst, "target_yinse_wav:", target_yinse_wav)
    # shutil.move(target_yinse_wav, f_dst)  ## move into the working dir
    shutil.copy(target_yinse_wav, f_dst)
    target_yinse_wav = f_dst
    ## load song vocal / accompaniment (paths may need adjusting)
    song_wav = os.path.join("{}{}/{}/vocal321.wav".format(song_folder, gender, song_name))  # song vocal
    inf_acc_path = os.path.join("{}{}/{}/acc.wav".format(song_folder, gender, song_name))
    # song_wav = './xusong_long.wav'
    svc_out_path = os.path.join(gs_work_dir, "svc.wav")  ### name of the svc output
print("inputMsg:", song_wav, target_yinse_wav, svc_out_path)
    ## svc process
    st = time.time()
    print("start inference...")
    # NOTE: cs_sim (the speaker-embedding extractor behind get_spk_embed) is
    # not defined in this file; it must be provided at module scope by the
    # caller before get_svc is invoked.
    similar = process_svc(song_wav, target_yinse_wav, svc_out_path, embed_model, hubert_model, cs_sim, paras)
print("svc finished!!")
print("time cost = {}".format(time.time() - st))
print("out path name {} ".format(svc_out_path))
# '''
    ## add reverb
    print("add reverberation...")
    svc_out_path_effect = svc_out_path[:-4] + '_effect.wav'
cmd = f"/data/gpu_env_common/bin/effect_tool {svc_out_path} {svc_out_path_effect}"
print("cmd :", cmd)
os.system(cmd)
    # merge vocal with accompaniment
print("add acc...")
out_path = svc_out_path_effect[:-4] + '_music.wav'
mix(svc_out_path_effect, inf_acc_path, out_path)
print("time cost = {}".format(time.time() - st))
print("out path name {} ".format(out_path))
# '''
    # Note: the raw SVC output path is returned; the reverb and mixed variants
    # are written alongside it.
    return svc_out_path
def meisheng_func(target_yinse_wav, song_name, paras):
##init
embed_model, hubert_model, gender_model = meisheng_init()
###gender predict
gender, female_rate, is_pure = gender_model.process(target_yinse_wav)
print('=====================')
print("gender:{}, female_rate:{},is_pure:{}".format(gender, female_rate, is_pure))
if gender == 0:
gender = 'female'
elif gender == 1:
gender = 'male'
elif female_rate > 0.5:
gender = 'female'
else:
gender = 'male'
print("modified gender:{} ".format(gender))
print('=====================')
    ## meisheng main
    paras['gender'] = gender  ## all time units are in ms
get_svc(target_yinse_wav, song_name, embed_model, hubert_model, paras)
if __name__ == '__main__':
    # target_yinse_wav = "./raw/meisheng_yinse/female/changying.wav"  # full path required
target_yinse_wav = "./raw/meisheng_yinse/female/target_yinse_cloris.m4a"
    song_name = "lost_stars"  ## song name
paras = {'gender': None, 'tst': 0, "tnd": None, 'delay': 0, 'song_path': None}
    # paras = {'gender': 'female', 'tst': 0, "tnd": 30, 'delay': 0}  ### segment SVC test
meisheng_func(target_yinse_wav, song_name, paras)
diff --git a/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x_onlyspk_double.py b/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x_onlyspk_double.py
new file mode 100644
index 0000000..4b0bed0
--- /dev/null
+++ b/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x_onlyspk_double.py
@@ -0,0 +1,217 @@
+
+import os, sys, torch
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from multiprocessing import cpu_count
+class Config:
+ def __init__(self,device,is_half):
+ self.device = device
+ self.is_half = is_half
+ self.n_cpu = 0
+ self.gpu_name = None
+ self.gpu_mem = None
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+ def device_config(self) -> tuple:
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ config_path = os.path.join(current_dir, "configs")
+ if torch.cuda.is_available():
+ i_device = int(self.device.split(":")[-1])
+ self.gpu_name = torch.cuda.get_device_name(i_device)
+ if (
+ ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+ or "P40" in self.gpu_name.upper()
+ or "1060" in self.gpu_name
+ or "1070" in self.gpu_name
+ or "1080" in self.gpu_name
+ ):
+ print("16系/10系显卡和P40强制单精度")
+ self.is_half = False
+ for config_file in ["32k.json", "40k.json", "48k.json"]:
+ with open(f"{config_path}/{config_file}", "r") as f:
+ strr = f.read().replace("true", "false")
+ with open(f"{config_path}/{config_file}", "w") as f:
+ f.write(strr)
+ with open(f"{current_dir}/trainset_preprocess_pipeline_print.py", "r") as f:
+ strr = f.read().replace("3.7", "3.0")
+ with open(f"{current_dir}/trainset_preprocess_pipeline_print.py", "w") as f:
+ f.write(strr)
+ else:
+ self.gpu_name = None
+ self.gpu_mem = int(
+ torch.cuda.get_device_properties(i_device).total_memory
+ / 1024
+ / 1024
+ / 1024
+ + 0.4
+ )
+ if self.gpu_mem <= 4:
+ with open(f"{current_dir}/trainset_preprocess_pipeline_print.py", "r") as f:
+ strr = f.read().replace("3.7", "3.0")
+ with open(f"{current_dir}/trainset_preprocess_pipeline_print.py", "w") as f:
+ f.write(strr)
+        elif torch.backends.mps.is_available():
+            print("No supported NVIDIA GPU found, using MPS for inference")
+            self.device = "mps"
+            self.is_half = False  # fp16 is unreliable off CUDA
+        else:
+            print("No supported NVIDIA GPU found, using CPU for inference")
+            self.device = "cpu"
+            self.is_half = False  # was True; half precision is not supported on CPU
+
+ if self.n_cpu == 0:
+ self.n_cpu = cpu_count()
+
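+        # x_pad / x_query / x_center / x_max are slicing windows in seconds
+        # used by the VC pipeline when cutting long audio into chunks.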
+        if self.is_half:
+            # settings for ~6 GB VRAM
+            x_pad = 3
+            x_query = 10
+            x_center = 80  # was 60
+            x_max = 85  # was 65
+        else:
+            # settings for ~5 GB VRAM
+            x_pad = 1
+            x_query = 6
+            x_center = 38
+            x_max = 41
+
+        if self.gpu_mem is not None and self.gpu_mem <= 4:
+ x_pad = 1
+ x_query = 5
+ x_center = 30
+ x_max = 32
+
+ return x_pad, x_query, x_center, x_max
+
+
+index_path="./logs/xusong_v2_org_version_multispk_charlie_puth_embed_in_dec_muloss_show/added_IVF614_Flat_nprobe_1_xusong_v2_org_version_multispk_charlie_puth_embed_in_dec_show_v2.index"
+# f0method="rmvpe" #harvest or pm
+index_rate=float("0.0") #index rate
+device="cuda:0"
+is_half=True
+filter_radius=int(3) ##3
+resample_sr=int(0) # 0
+rms_mix_rate=float(1) # rms混合比例 1,不等于1混合
+protect=float(0.33 )## ??? 0.33 fang
+
+
+
+#print(sys.argv)
+config=Config(device,is_half)
+now_dir=os.getcwd()
+sys.path.append(now_dir)
+
+from vc_infer_pipeline_org_embed_spk import VC
+from lib.infer_pack.models_embed_in_dec_diff_control_enc_spken200x_onlyspk_double import (
+ SynthesizerTrnMs256NSFsid,
+ SynthesizerTrnMs256NSFsid_nono,
+ SynthesizerTrnMs768NSFsid,
+ SynthesizerTrnMs768NSFsid_nono,
+)
+from lib.audio import load_audio
+from fairseq import checkpoint_utils
+from scipy.io import wavfile
+from AIMeiSheng.docker_demo.common import gs_hubert_model_path
+# hubert_model=None
+def load_hubert():
+ # global hubert_model
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([gs_hubert_model_path],suffix="",)
+ #models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["checkpoint_best_legacy_500.pt"],suffix="",)
+ hubert_model = models[0]
+ hubert_model = hubert_model.to(device)
+    if is_half:
+        hubert_model = hubert_model.half()
+    else:
+        hubert_model = hubert_model.float()
+ hubert_model.eval()
+ return hubert_model
+
+def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,index_rate,hubert_model,paras):
+ global tgt_sr,net_g,vc,version
+    if input_audio is None:
+        return "You need to upload an audio", None
+ f0_up_key = int(f0_up_key)
+ # print("@@xxxf0_up_key:",f0_up_key)
+ audio = load_audio(input_audio,16000)
+    if paras is not None:
+        # optional trim: paras['tst'] / paras['tnd'] are start/end times in ms,
+        # converted to sample offsets at 16 kHz
+        st = int(paras['tst'] * 16000 / 1000)
+        en = len(audio)
+        if paras['tnd'] is not None:
+            en = min(en, int(paras['tnd'] * 16000 / 1000))
+        audio = audio[st:en]
+
+ times = [0, 0, 0]
+    if hubert_model is None:
+        hubert_model = load_hubert()
+ if_f0 = cpt.get("f0", 1)
+ audio_opt=vc.pipeline_mulprocess(hubert_model,net_g,sid,audio,input_audio,times,f0_up_key,f0_method,file_index,index_rate,if_f0,filter_radius,tgt_sr,resample_sr,rms_mix_rate,version,protect,f0_file=f0_file)
+
+ #print(times)
+ #print("@@using multi process")
+ return audio_opt
+
+
+def get_vc_core(model_path,is_half):
+
+ #print("loading pth %s" % model_path)
+ cpt = torch.load(model_path, map_location="cpu")
+ tgt_sr = cpt["config"][-1]
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+ if_f0 = cpt.get("f0", 1)
+ version = cpt.get("version", "v1")
+ if version == "v1":
+ if if_f0 == 1:
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
+ else:
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+ elif version == "v2":
+ if if_f0 == 1: #
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
+ else:
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+ #print("load model finished")
+ del net_g.enc_q
+ net_g.load_state_dict(cpt["weight"], strict=False)
+ #print("load net_g finished")
+
+ return tgt_sr,net_g,cpt,version
+
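+# Note: get_vc1 below builds the models into locals and returns nothing; the
+# callers in this repo use get_vc, which publishes them through module globals.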
+def get_vc1(model_path,is_half):
+ tgt_sr, net_g, cpt, version = get_vc_core(model_path, is_half)
+
+ net_g.eval().to(device)
+    if is_half:
+        net_g = net_g.half()
+    else:
+        net_g = net_g.float()
+ vc = VC(tgt_sr, config)
+ n_spk=cpt["config"][-3]
+ return
+def get_rmvpe(model_path="rmvpe.pt"):
+ from lib.rmvpe import RMVPE
+ global f0_method
+ #print("loading rmvpe model")
+ f0_method = RMVPE(model_path, is_half=True, device='cuda')
+ return f0_method
+
+
+def get_vc(model_path):
+ global n_spk,tgt_sr,net_g,vc,cpt,device,is_half,version
+ tgt_sr, net_g, cpt, version = get_vc_core(model_path, is_half)
+
+ net_g.eval().to(device)
+    if is_half:
+        net_g = net_g.half()
+    else:
+        net_g = net_g.float()
+ vc = VC(tgt_sr, config)
+ n_spk=cpt["config"][-3]
+ # return {"visible": True,"maximum": n_spk, "__type__": "update"}
+ # return net_g
+
+
+def svc_main(input_path,opt_path,sid_embed,f0up_key=0,hubert_model=None, paras=None):
+ #print("sid_embed: ",sid_embed)
+ wav_opt = vc_single(sid_embed,input_path,f0up_key,None,f0_method,index_path,index_rate,hubert_model,paras)
+ #print("out_path: ",opt_path)
+ wavfile.write(opt_path, tgt_sr, wav_opt)
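+
+
+if __name__ == "__main__":
+    # Editor's sketch of the expected call order (paths are placeholders, not
+    # shipped filenames), mirroring load_model() in meisheng_svc_final.py:
+    # load the SVC net, HuBERT and RMVPE before calling svc_main.
+    get_vc("weights/model.pth")
+    hubert = load_hubert()
+    get_rmvpe("rmvpe.pt")
+    svc_main("song.wav", "out.wav", "target_embed.npy", f0up_key=0,
+             hubert_model=hubert, paras=None)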
+
+
+
+