Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F4880356
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
54 KB
Subscribers
None
View Options
diff --git a/AIMeiSheng/docker_demo/Dockerfile b/AIMeiSheng/docker_demo/Dockerfile
index 1f2806d..5e2bd37 100644
--- a/AIMeiSheng/docker_demo/Dockerfile
+++ b/AIMeiSheng/docker_demo/Dockerfile
@@ -1,28 +1,28 @@
# 系统版本 CUDA Version 11.8.0
# NAME="CentOS Linux" VERSION="7 (Core)"
# FROM starmaker.tencentcloudcr.com/starmaker/av/av:1.1
# 基础镜像, python3.9,cuda118,centos7,外加ffmpeg
#FROM starmaker.tencentcloudcr.com/starmaker/av/av_base:1.0
FROM registry.ushow.media/av/av_base:1.0
#FROM av_base_test:1.0
RUN source /etc/profile && sed -i 's|mirrorlist=|#mirrorlist=|g' /etc/yum.repos.d/CentOS-Base.repo && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Base.repo && yum clean all && yum install -y unzip && yum install -y libsndfile && yum install -y libsamplerate libsamplerate-devel
RUN source /etc/profile && pip3 install librosa==0.9.1 && pip3 install gradio && pip3 install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# SECURITY(review): the COS secret id / secret key passed to `coscmd config` below are committed in plain text and get baked into an image layer. Rotate them and supply credentials at build or run time instead.
RUN source /etc/profile && pip3 install urllib3==1.26.15 && pip3 install coscmd && coscmd config -a AKIDoQmshFWXGitnQmrfCTYNwEExPaU6RVHm -s F9n9E2ZonWy93f04qMaYFfogHadPt62h -b log-sg-1256122840 -r ap-singapore
RUN source /etc/profile && pip3 install asteroid-filterbanks
RUN source /etc/profile && pip3 install praat-parselmouth==0.4.3
RUN source /etc/profile && pip3 install pyworld
RUN source /etc/profile && pip3 install faiss-cpu
RUN source /etc/profile && pip3 install torchcrepe
RUN source /etc/profile && pip3 install thop
RUN source /etc/profile && pip3 install ffmpeg-python
RUN source /etc/profile && pip3 install fairseq
RUN source /etc/profile && pip3 install redis==4.5.0
-RUN #source /etc/profile && pip3 install numpy=1.26.4
+RUN source /etc/profile && pip3 install numpy==1.26.4
COPY ./ /data/code/
WORKDIR /data/code
-#CMD ["/bin/bash", "-c", "source /etc/profile; export PYTHONPATH=/data/code; cd /data/code/AIMeiSheng/docker_demo; python3 offline_server.py"]
-CMD ["/bin/bash", "-c", "source /etc/profile; export PYTHONPATH=/data/code; cd /data/code/AIMeiSheng/docker_demo; python3 tmp.py"]
\ No newline at end of file
+CMD ["/bin/bash", "-c", "source /etc/profile; export PYTHONPATH=/data/code; cd /data/code/AIMeiSheng/docker_demo; python3 offline_server.py"]
+#CMD ["/bin/bash", "-c", "source /etc/profile; export PYTHONPATH=/data/code; cd /data/code/AIMeiSheng/docker_demo; python3 tmp.py"]
\ No newline at end of file
diff --git a/AIMeiSheng/docker_demo/readme.txt b/AIMeiSheng/docker_demo/readme.txt
index a03e512..bb9fc16 100644
--- a/AIMeiSheng/docker_demo/readme.txt
+++ b/AIMeiSheng/docker_demo/readme.txt
@@ -1,24 +1,24 @@
简介: ai美声功能,其核心是输入一段15-30s的人声作为音色信息,再给定输入音源,将音源转换为指定音色的声音的效果。例如,孙燕姿演唱的东风破
架构方案: http_server.py (1个) 作为服务端,接收外部传来的数据,塞入到redis中,由offline_server.py (多个服务) 进行承接
# 部署要求:
1. http_server.py 部署在sg-prod-songrefresh-gpu-7 上
2. offline_server.py 使用docker 部署在超级节点上,由运维进行控制
# http_server.py 环境要求:
pip install redis
pip install flask
# offline_server.py 环境要求(docker)
cd docker_demo目录下(例子如下):
1. docker build -f Dockerfile -t av_ai_meisheng .
(通过docker images 获取av_ai_meisheng的image_id)
2. docker run --gpus all -it -v /data/rsync/jianli.yang/av_svc:/data/code image_id # 即可启动服务
# 测试代码: docker 环境下, offline_server.py 即可验证
# http测试命令:
-curl http://127.0.0.1:5000/ai_meisheng -H "Content-Type: application/json" -d '{ "record_song_url": "http://starmaker-sg-1256122840.cos.ap-singapore.myqcloud.com/production/ai_voice/6755399445110104/f7ced5f67bcb2351a5b9a03fb8f81620-source.mp4", "target_url": "http://starmaker-sg-1256122840.cos.ap-singapore.myqcloud.com/production/ai_voice/6755399445110104/f7ced5f67bcb2351a5b9a03fb8f81620-target_test.mp4","start": 33300,"end": 208677,"vocal_loudness": -14.57,"female_recording_url": "http://starmaker-sg-1256122840.cos.ap-singapore.myqcloud.com/production/uploading/recordings_origin/4222124723437931/origin_master.mp4", "male_recording_url": "http://starmaker-sg-1256122840.cos.ap-singapore.myqcloud.com/production/uploading/recordings_origin/12666374036224383/origin_master.mp4"}'
+curl http://127.0.0.1:5000/ai_meisheng -H "Content-Type: application/json" -d '{"record_song_url": "http://starmaker-sg-1256122840.cos.ap-singapore.myqcloud.com/production/ai_voice/10414574146376859/5940142f51165c9dfcbee4702c7df977-source.mp4", "target_url": "http://starmaker-sg-1256122840.cos.ap-singapore.myqcloud.com/production/ai_voice/10414574146376859/5940142f51165c9dfcbee4702c7df977-target111.mp4", "start": 30778, "end": 221169, "vocal_loudness": -31.821442613813534, "female_recording_url": "http://songbook-starmaker-sg-1256122840.cos.ap-singapore.myqcloud.com/production/songbook/ai-voice/57edc985a8c2e59f5069bb2b77ac5eff.m4a", "male_recording_url": "http://songbook-starmaker-sg-1256122840.cos.ap-singapore.myqcloud.com/production/songbook/ai-voice/e24dd7772c52c61ea6cf0b6031c77235.m4a"}'
{"gender":"male","schedule":100,"status":0,"target_song_url":"https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/vocal_test/out.m4a"}
# 资源消耗: 显存占用约2G,但是最高能到9G, 所以,一台机器部署一个即可
注意: 通过common.py 的prod可以控制是否是线上环境
\ No newline at end of file
diff --git a/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x_onlyspk_double.py b/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x_onlyspk_double.py
index 1268f17..8348721 100644
--- a/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x_onlyspk_double.py
+++ b/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x_onlyspk_double.py
@@ -1,1301 +1,1301 @@
import math, pdb, os
from time import time as ttime
import torch
from torch import nn
from torch.nn import functional as F
from lib.infer_pack import modules
from lib.infer_pack import attentions_in_dec_double as attentions
from lib.infer_pack import commons
from lib.infer_pack.commons import init_weights, get_padding
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from lib.infer_pack.commons import init_weights
import numpy as np
from lib.infer_pack import commons
from thop import profile
from diffuse_fang.diffUse_wraper_double import diff_decoder,ddpm_para
ddpm_dp = ddpm_para()
g2_dim = 256
class TextEncoder256(nn.Module):
    """Prior encoder for 256-dimensional phone features.

    Projects the phone features (plus an optional pitch-index embedding) to
    ``hidden_channels``, runs them through ``attentions.Encoder`` and maps
    the result to ``2 * out_channels`` statistics with a 1x1 convolution.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        """Build the encoder; ``f0`` controls whether a pitch embedding exists."""
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            # Pitch is fed as an index into a 256-entry embedding table.
            self.emb_pitch = nn.Embedding(256, hidden_channels)
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode ``phone`` (and ``pitch`` unless it is None).

        Returns:
            ``(m, logs, x_mask)``: the two halves of the projected statistics
            and the sequence mask built from ``lengths``.
        """
        x = self.emb_phone(phone)
        # Identity check: ``pitch`` is either None or a tensor, and ``==`` on a
        # tensor is not a reliable way to test for None.
        if pitch is not None:
            x = x + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
class TextEncoder768(nn.Module):
    """Prior encoder for 768-dimensional phone features with conditioning.

    Same layout as the 256-dimensional encoder, except that the input
    projection takes 768 features, the conditioning tensor ``g`` is forwarded
    to ``attentions.Encoder``, and the encoder output is returned as well.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        """Build the encoder; ``f0`` controls whether a pitch embedding exists."""
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            # Pitch is fed as an index into a 256-entry embedding table.
            self.emb_pitch = nn.Embedding(256, hidden_channels)
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths, g):
        """Encode ``phone`` (and ``pitch`` unless it is None) conditioned on ``g``.

        Returns:
            ``(m, logs, x_mask, x)`` where ``x`` is the raw encoder output
            before the final projection.
        """
        x = self.emb_phone(phone)
        # Identity check: ``pitch`` is either None or a tensor.
        if pitch is not None:
            x = x + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        # The conditioning tensor is consumed inside the attention encoder.
        x = self.encoder(x * x_mask, x_mask, g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask, x
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` residual coupling layers, each followed by a flip."""

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        # Layout: [coupling, flip, coupling, flip, ...]; couplings sit at even
        # indices, which remove_weight_norm() relies on.
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows forward, or in reversed order when ``reverse`` is set."""
        if reverse:
            for layer in reversed(self.flows):
                x = layer(x, x_mask, g=g, reverse=reverse)
            return x
        for layer in self.flows:
            # In the forward direction each layer returns a pair; only the
            # transformed tensor is kept.
            x, _ = layer(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Strip weight norm from every coupling layer (the even-indexed flows)."""
        for idx in range(0, 2 * self.n_flows, 2):
            self.flows[idx].remove_weight_norm()
class PosteriorEncoder(nn.Module):
    """Encode an input into mean / log-scale statistics and draw a sample."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` for input ``x`` of given lengths."""
        valid = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(valid, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        # First half of the channels is the mean, second half the log-scale.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Random sample: mean plus noise scaled by exp(logs).
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Strip weight norm from the WN stack."""
        self.enc.remove_weight_norm()
class Generator(torch.nn.Module):
    """Upsampling decoder: transposed convolutions interleaved with resblocks."""

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; any other value selects ResBlock2.
        block_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        # Each stage halves the channel count.
        self.ups = nn.ModuleList()
        for stage, (rate, ksize) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            up = ConvTranspose1d(
                upsample_initial_channel // (2**stage),
                upsample_initial_channel // (2 ** (stage + 1)),
                ksize,
                rate,
                padding=(ksize - rate) // 2,
            )
            self.ups.append(weight_norm(up))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for ksize, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(block_cls(ch, ksize, dilation))

        # ``ch`` is the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x``; ``g`` (if given) is added after the input convolution."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            # Average the outputs of this stage's parallel resblocks.
            first = stage * self.num_kernels
            acc = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                if acc is None:
                    acc = branch
                else:
                    acc += branch
            x = acc / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight norm from the upsampling layers and all resblocks."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SinGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
        segment is always sin(np.pi) or cos(0)
    NOTE(review): flag_for_pulse is accepted but never read by the code below.
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate uv signal: 1 where f0 exceeds the voiced threshold, else 0
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0, upp):
        """sine_tensor, uv, noise = forward(f0, upp)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        upp: upsampling factor applied along the length axis
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        output noise: the noise term that was added to the sine waves
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            # Taking % 1 here means the harmonic-number product cannot be
            # optimised in post-processing.
            rad_values = (f0_buf / self.sampling_rate) % 1
            # Random initial phase per harmonic; the fundamental starts at 0.
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # A "% 1" here would mean the later cumsum could not be optimised.
            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(
                2, 1
            )
            tmp_over_one %= 1
            # True where the wrapped phase decreased, i.e. it crossed 1.
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # Voiced frames get noise_std; unvoiced frames get sine_amp / 3.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
    """Source module for hn-NSF.

    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of the additive Gaussian noise (default: 0.003);
        the noise amplitude in unvoiced regions is decided by sine_amp
    voiced_threshod: threshold used to set U/V given F0 (default: 0)
    is_half: cast the sine waves to float16 before merging (default: True)

    forward() returns ``(sine_merge, None, None)``; the noise and uv slots
    of the tuple are always None in this implementation.
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half

        # Produces the sine waveforms.
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )
        # Merges the source harmonics into a single excitation channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Generate the merged sine excitation for F0 input ``x``."""
        harmonics, _uv, _noise = self.l_sin_gen(x, upp)
        if self.is_half:
            harmonics = harmonics.half()
        merged = self.l_linear(harmonics)
        sine_merge = self.l_tanh(merged)
        return sine_merge, None, None  # noise, uv
class GeneratorNSF(torch.nn.Module):
    """NSF upsampling decoder with per-stage conditioning.

    On top of the upsample / resblock stack, every stage adds (a) a
    strided-convolution view of the sine excitation produced from ``f0``
    and (b) a 1x1-convolution projection of the conditioning tensor.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; any other value selects ResBlock2.
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        # Per-stage projections of the conditioning tensor.
        self.ups_g = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            self.ups_g.append(
                nn.Conv1d(
                    upsample_initial_channel,
                    upsample_initial_channel // (2 ** (i + 1)),
                    1,
                )
            )
            if i + 1 < len(upsample_rates):
                # Stride is the product of the remaining upsample rates.
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock(ch, k, d))

        # ``ch`` is the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode ``x`` using the excitation from ``f0`` and conditioning ``g``.

        When ``g`` is None all conditioning terms are skipped. The previous
        code referenced the projected conditioning inside the loop
        unconditionally and raised UnboundLocalError in that case.
        """
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)

        cond_g = None
        if g is not None:
            cond_g = self.cond(g)
            x = x + cond_g

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            if cond_g is not None:
                # Re-inject the conditioning at this stage's channel width.
                x = x + self.ups_g[i](cond_g)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        """Strip weight norm from the upsampling layers and all resblocks."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
# Maps the string sample-rate tags accepted by the synthesizer constructors
# to their integer values.
sr2sr = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
    "24k": 24000,
}
class SynthesizerTrnMs256NSFsid(nn.Module):
    """Synthesizer with a 256-dim text encoder, flow, and NSF decoder.

    Speaker identity is looked up in an ``nn.Embedding`` of size
    ``spk_embed_dim`` and passed as conditioning ``g`` to the posterior
    encoder, the flow and the decoder.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        """Build all sub-modules; ``kwargs`` must contain ``is_half``."""
        super().__init__()
        # ``sr`` may be given as a tag such as "40k"; translate it via sr2sr.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Strip weight norm from the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds is the speaker id, [bs, 1]
        """Training pass.

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``
        """
        # [b, 256, 1]; the trailing 1 is the time axis and is broadcast.
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        # Randomly cut a fixed-length (segment_size) window out of z; the
        # start positions are returned in ids_slice.
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
        """Inference pass.

        If ``rate`` is truthy only the trailing ``rate`` fraction of frames
        is decoded. Returns ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
            nsff0 = nsff0[:, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        print('z shape: ',z.shape)
        print('x_mask shape: ',x_mask.shape)
        z_x_mask = z * x_mask
        print('z_x_mask shape: ',z_x_mask.shape)
        print('nsff0 shape:p', nsff0.shape)
        print('g shape: ',g.shape)
        o = self.dec(z_x_mask, nsff0, g=g)
        # get_floats() is deliberately NOT called here any more: it moves the
        # decoder to CUDA/half and runs a profiler, which must not happen on
        # every inference request. Call it explicitly when profiling.
        return o, x_mask, (z, z_p, m_p, logs_p)

    def get_floats(self):
        """Profile the decoder with thop and print FLOPs / parameter counts.

        Side effects: moves ``self.dec`` to CUDA and converts it to half
        precision, so it requires a CUDA device. Always returns 0.
        """
        # Normalisation constant taken from the reference clip
        # 郭宇_但愿人长久_40k.wav (presumably its duration; verify).
        T = 21.4
        # Random dummy inputs with 2740 frames.
        z = torch.randn(1, g2_dim, 2740)
        g = torch.randn(1, 256, 1)
        inputs_bfcc = z
        nsff0 = torch.randn(1, 2740)
        devices = 'cuda'
        self.dec = self.dec.to(devices).half()
        inputs_bfcc, nsff0, g = (
            inputs_bfcc.to(devices).half(),
            nsff0.to(devices).half(),
            g.to(devices).half(),
        )
        flops, params = profile(self.dec, (inputs_bfcc, nsff0, g))
        print(f'@@@hifi-gan nsf decflops: {flops/(T*pow(10,9))} GFLOPS, params: { params/pow(10,6)} M')
        return 0
class SynthesizerTrnMs768NSFsid(nn.Module):
    """Synthesizer with a 768-dim text encoder, flow, diffusion refinement
    and NSF decoder.

    The speaker conditioning is a continuous vector passed in by the caller
    (``ds`` / ``sid``) rather than an embedding lookup. The module-level
    ``diff_decoder`` is used to refine the flow output before decoding.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        """Build all sub-modules; ``kwargs`` must contain ``is_half``."""
        super().__init__()
        # ``sr`` may be given as a tag such as "40k"; translate it via sr2sr.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
        self.diff_decoder = diff_decoder
        # Zero-initialised convolutions around the diffusion decoder.
        self.diff_cond_gx = self.zero_module(self.conv_nd(1, 256, g2_dim, 3, padding=1))
        self.diff_cond_out = self.zero_module(self.conv_nd(1, g2_dim, g2_dim, 3, padding=1))
        # Scale applied to the flow output before it is handed to the
        # diffusion decoder as ``gt_spec`` in inference mode.
        self.lzp = 0.1
        self.ssl_proj = self.zero_module(nn.Conv1d(256 * 2, 256, 1, stride=1))
        self.ssl_proj1 = self.zero_module(nn.Conv1d(256, 256, 1, stride=1))
        self.ssl_proj1_norm = nn.BatchNorm1d(256)
        self.ssl_proj2 = self.zero_module(nn.Conv1d(256, 256, 1, stride=1))
        self.ssl_proj2_norm = nn.BatchNorm1d(256)

    def zero_module(self, module):
        """
        Zero out the parameters of a module and return it.
        """
        for p in module.parameters():
            p.detach().zero_()
        return module

    def conv_nd(self, dims, *args, **kwargs):
        """
        Create a 1D, 2D, or 3D convolution module.

        Raises:
            ValueError: if ``dims`` is not 1, 2 or 3.
        """
        if dims == 1:
            return nn.Conv1d(*args, **kwargs)
        elif dims == 2:
            return nn.Conv2d(*args, **kwargs)
        elif dims == 3:
            return nn.Conv3d(*args, **kwargs)
        raise ValueError(f"unsupported dimensions: {dims}")

    def remove_weight_norm(self):
        """Strip weight norm from the decoder, flow and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):
        """Training pass.

        ``ds`` is the conditioning vector; its first 256 channels and the
        remaining channels are treated separately below.

        Returns:
            ``(o, ids_slice, x_mask, y_mask,
            (z, z_p, m_p, logs_p, m_q, logs_q), diff_loss)``
        """
        g = ds.unsqueeze(-1)
        g1 = self.ssl_proj1_norm(self.ssl_proj1(g[:, :256, :]))
        # NOTE(review): g2 is computed but not used in the sum below. The call
        # is kept because it still runs ssl_proj2 / ssl_proj2_norm in the
        # forward pass, exactly as the original code does.
        g2 = self.ssl_proj2_norm(self.ssl_proj2(g[:, 256:, :]))
        g = g1 + g[:, 256:, :]

        m_p, logs_p, x_mask, x_embed = self.enc_p(phone, pitch, phone_lengths, g)
        # The posterior encoder samples the latent z; m_q / logs_q are its
        # mean and log-scale, y_mask the sequence mask.
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_p_sample = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * y_mask
        zx = self.flow(z_p_sample, y_mask, g=g, reverse=True)

        g_z_p = self.diff_cond_gx(g)
        # Residual between the posterior latent and the prior-derived latent;
        # this is the diffusion training target.
        z_res = z - zx
        z_p1 = x_embed + g_z_p

        # --- diffusion start ---
        z_p_diff = z_p1.transpose(1, 2)  # b, frames, feat
        z_diff = z_res.transpose(1, 2)  # b, frames, feat
        diff_loss, _ = self.diff_decoder(
            z_p_diff,
            gt_spec=z_diff,
            infer=False,
            infer_speedup=ddpm_dp.infer_speedup,
            method=ddpm_dp.method,
            use_tqdm=ddpm_dp.use_tqdm,
        )
        t = 200  # fixed k_step for the in-training inference pass
        z_diff = zx.transpose(1, 2)
        z_x_diff = self.diff_decoder(
            z_p_diff,
            gt_spec=z_diff * self.lzp,
            infer=True,
            infer_speedup=ddpm_dp.infer_speedup,
            method=ddpm_dp.method,
            k_step=t,
            use_tqdm=False,
        )
        z1 = z_x_diff.transpose(1, 2)
        z1 = self.diff_cond_out(z1)
        z_in = zx + z1
        # --- diffusion end ---

        z_slice, ids_slice = commons.rand_slice_segments(
            z_in, y_lengths, self.segment_size
        )
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), diff_loss

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
        """Inference pass.

        If ``rate`` is truthy only the trailing ``rate`` fraction of frames
        is decoded. Returns ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = sid.unsqueeze(-1).unsqueeze(0)
        g = self.ssl_proj(g)
        print("@@@@@@pitch:",pitch.shape,"phone:",phone.shape)
        m_p, logs_p, x_mask, x_embed = self.enc_p(phone, pitch, phone_lengths, g)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
            nsff0 = nsff0[:, -head:]
            # Trim the encoder output too, so the diffusion condition has the
            # same number of frames as the latent it refines.
            x_embed = x_embed[:, :, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)

        g_z_p = self.diff_cond_gx(g)
        z_p1 = x_embed + g_z_p
        z_p_diff = z_p1.transpose(1, 2).float()  # b, frames, feat
        z_diff = z.transpose(1, 2)  # b, frames, feat
        # The diffusion decoder is always run in float32.
        self.diff_decoder = self.diff_decoder.float()
        z_x = self.diff_decoder(
            z_p_diff,
            gt_spec=z_diff * self.lzp,
            infer=True,
            infer_speedup=ddpm_dp.infer_speedup,
            method=ddpm_dp.method,
            k_step=200,
            use_tqdm=ddpm_dp.use_tqdm,
        )
        # Cast back to whatever precision diff_cond_out runs in (half when the
        # model is half, float32 otherwise) instead of hard-coding .half().
        z1 = z_x.transpose(1, 2).to(self.diff_cond_out.weight.dtype)
        z_res = self.diff_cond_out(z1)
        z = z + z_res
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    """Synthesizer built on ``TextEncoder256`` with ``f0=False``.

    The text encoder is always called with ``None`` in the pitch slot and the
    decoder (``Generator``) receives no F0 input. Speaker conditioning comes
    from an ``nn.Embedding`` lookup (``emb_g``).
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        """Store the hyper-parameters and build the sub-networks.

        ``sr`` and any extra ``kwargs`` are accepted but not used here.
        """
        super().__init__()

        # --- plain hyper-parameter attributes --------------------------------
        self.spec_channels = spec_channels
        self.segment_size = segment_size
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim

        # --- sub-networks (creation order is kept: enc_p, dec, enc_q, flow, emb_g)
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Call ``remove_weight_norm`` on the decoder, flow and posterior encoder."""
        for part in (self.dec, self.flow, self.enc_q):
            part.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        """Training pass.

        ``ds`` is the speaker id (original note: shape ``[bs, 1]``); it is
        looked up in ``emb_g``.

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        # Original note: [b, 256, 1]; the trailing 1 is the time axis and is
        # broadcast.
        speaker = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=speaker)
        z_p = self.flow(z, y_mask, g=speaker)
        # Only a random segment of the latent is decoded.
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=speaker)
        latents = (z, z_p, m_p, logs_p, m_q, logs_q)
        return o, ids_slice, x_mask, y_mask, latents

    def infer(self, phone, phone_lengths, sid, rate=None):
        """Inference pass.

        Samples the prior with the noise scaled by 0.66666, optionally keeps
        only the last ``int(frames * rate)`` frames, runs the flow in reverse
        and decodes.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        speaker = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        scaled_noise = torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666
        z_p = (m_p + scaled_noise) * x_mask
        if rate:
            head = int(z_p.shape[2] * rate)
            z_p, x_mask = z_p[:, :, -head:], x_mask[:, :, -head:]
        z = self.flow(z_p, x_mask, g=speaker, reverse=True)
        o = self.dec(z * x_mask, g=speaker)
        return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMs768NSFsid_nono(nn.Module):
    """Synthesizer built on ``TextEncoder768`` with ``f0=False``.

    Unlike the embedding-lookup variant, ``forward``/``infer`` use the speaker
    argument directly as the conditioning tensor and also pass it to the text
    encoder. ``emb_g`` is still created in ``__init__`` but is not used by
    either method.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        """Store the hyper-parameters and build the sub-networks.

        ``sr`` and any extra ``kwargs`` are accepted but not used here.
        """
        super().__init__()

        # --- plain hyper-parameter attributes --------------------------------
        self.spec_channels = spec_channels
        self.segment_size = segment_size
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim

        # --- sub-networks (creation order is kept: enc_p, dec, enc_q, flow, emb_g)
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Call ``remove_weight_norm`` on the decoder, flow and posterior encoder."""
        for part in (self.dec, self.flow, self.enc_q):
            part.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        """Training pass.

        ``ds`` is used directly as the conditioning tensor (a trailing axis is
        appended); the ``emb_g`` lookup is disabled.
        # presumably ds is a (batch, channels) speaker embedding -- verify
        # against the caller

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        speaker = ds.unsqueeze(-1)
        # The text encoder is also conditioned on the speaker tensor.
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths, g=speaker)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=speaker)
        z_p = self.flow(z, y_mask, g=speaker)
        # Only a random segment of the latent is decoded.
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=speaker)
        latents = (z, z_p, m_p, logs_p, m_q, logs_q)
        return o, ids_slice, x_mask, y_mask, latents

    def infer(self, phone, phone_lengths, sid, rate=None):
        """Inference pass.

        ``sid`` is used directly as the conditioning tensor: a trailing axis
        and then a leading axis are added before use. The prior is sampled
        with the noise scaled by 0.66666; when ``rate`` is truthy only the
        last ``int(frames * rate)`` frames are kept.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        speaker = sid.unsqueeze(-1).unsqueeze(0)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths, g=speaker)
        scaled_noise = torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666
        z_p = (m_p + scaled_noise) * x_mask
        if rate:
            head = int(z_p.shape[2] * rate)
            z_p, x_mask = z_p[:, :, -head:], x_mask[:, :, -head:]
        z = self.flow(z_p, x_mask, g=speaker, reverse=True)
        o = self.dec(z * x_mask, g=speaker)
        return o, x_mask, (z, z_p, m_p, logs_p)
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by ``DiscriminatorP`` for periods 2, 3, 5, 7, 11, 17."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # The scale discriminator is created first, then one period
        # discriminator per entry, in this order.
        scale_disc = DiscriminatorS(use_spectral_norm=use_spectral_norm)
        period_discs = [
            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            for period in (2, 3, 5, 7, 11, 17)
        ]
        self.discriminators = nn.ModuleList([scale_disc] + period_discs)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists, each with one entry per sub-discriminator: outputs for
            ``y``, outputs for ``y_hat``, feature maps for ``y`` and feature
            maps for ``y_hat``.
        """
        # Each pair is evaluated as d(y) first, then d(y_hat).
        outputs = [(disc(y), disc(y_hat)) for disc in self.discriminators]
        y_d_rs = [on_y[0] for on_y, _ in outputs]
        y_d_gs = [on_y_hat[0] for _, on_y_hat in outputs]
        fmap_rs = [on_y[1] for on_y, _ in outputs]
        fmap_gs = [on_y_hat[1] for _, on_y_hat in outputs]
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """One ``DiscriminatorS`` followed by ``DiscriminatorP`` for periods 2, 3, 5, 7, 11, 17, 23, 37."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # The scale discriminator is created first, then one period
        # discriminator per entry, in this order.
        scale_disc = DiscriminatorS(use_spectral_norm=use_spectral_norm)
        period_discs = [
            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            for period in (2, 3, 5, 7, 11, 17, 23, 37)
        ]
        self.discriminators = nn.ModuleList([scale_disc] + period_discs)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists, each with one entry per sub-discriminator: outputs for
            ``y``, outputs for ``y_hat``, feature maps for ``y`` and feature
            maps for ``y_hat``.
        """
        # Each pair is evaluated as d(y) first, then d(y_hat).
        outputs = [(disc(y), disc(y_hat)) for disc in self.discriminators]
        y_d_rs = [on_y[0] for on_y, _ in outputs]
        y_d_gs = [on_y_hat[0] for _, on_y_hat in outputs]
        fmap_rs = [on_y[1] for on_y, _ in outputs]
        fmap_gs = [on_y_hat[1] for _, on_y_hat in outputs]
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
    """Stack of six normalised 1-D convolutions plus a one-channel output conv.

    Args:
        use_spectral_norm: when truthy every convolution is wrapped with
            ``spectral_norm``; otherwise (the default) with ``weight_norm``.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # Truthiness test instead of comparing against the False literal.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_output, fmap)``.

        ``fmap`` holds the leaky-ReLU activation after each entry of
        ``self.convs`` followed by the raw ``conv_post`` output; the first
        return value is that last output flattened from dim 1 onward.
        """
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the last axis into rows of ``period`` samples
    and applies a stack of normalised ``(kernel_size, 1)`` 2-D convolutions.

    Args:
        period: width of the folded view built in ``forward``.
        kernel_size: kernel height of the five main convolutions.
        stride: stride along the first spatial axis for the first four
            convolutions; the fifth uses stride 1.
        use_spectral_norm: when truthy every convolution is wrapped with
            ``spectral_norm``; otherwise (the default) with ``weight_norm``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Truthiness test instead of comparing against the False literal.
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        padding = (get_padding(kernel_size, 1), 0)
        channels = (1, 32, 128, 512, 1024, 1024)
        # Four strided layers followed by one with stride 1.
        strides = [(stride, 1)] * 4 + [1]
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv2d(
                        in_ch,
                        out_ch,
                        (kernel_size, 1),
                        layer_stride,
                        padding=padding,
                    )
                )
                for in_ch, out_ch, layer_stride in zip(
                    channels, channels[1:], strides
                )
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_output, fmap)`` for a 3-D input ``(b, c, t)``.

        ``fmap`` holds the leaky-ReLU activation after each entry of
        ``self.convs`` followed by the raw ``conv_post`` output; the first
        return value is that last output flattened from dim 1 onward.
        """
        fmap = []

        # 1d to 2d: reflect-pad the end so t is a multiple of the period,
        # then view the signal as (b, c, t // period, period).
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:35 (1 d, 15 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1346160
Default Alt Text
(54 KB)
Attached To
R350 av_svc
Event Timeline
Log In to Comment