Page MenuHomePhabricator

No OneTemporary

diff --git a/AutoCoverTool/script/train_user_by_one_media.py b/AutoCoverTool/script/train_user_by_one_media.py
index e733f67..561cd63 100644
--- a/AutoCoverTool/script/train_user_by_one_media.py
+++ b/AutoCoverTool/script/train_user_by_one_media.py
@@ -1,547 +1,544 @@
"""
使用一句话进行人声训练
1. 数据集
2. 训练
"""
from ref.so_vits_svc.models import SynthesizerTrn, MultiPeriodDiscriminator
from ref.so_vits_svc.mel_processing import spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch
import ref.so_vits_svc.utils as utils
import ref.so_vits_svc.commons as commons
from ref.so_vits_svc.losses import kl_loss, generator_loss, discriminator_loss, feature_loss
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import os
import time
import torch
import random
import librosa
import soundfile
import torchaudio
import parselmouth
import numpy as np
from tqdm import tqdm
from scipy.io.wavfile import read
from pyworld import pyworld
from copy import deepcopy
import torch.utils.data
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
# Path of the external denoise executable that process_train_and_infer runs on the training audio.
gs_denoise_exe = "/data/gpu_env_common/bin/denoise_exe"
# HuBERT model shared by the dataset and the trainer; loaded once at import time.
# The argument is 0 when CUDA is available, otherwise None.
gs_hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None)
# Keyword arguments expanded into SynthesizerTrn(**gs_model_config);
# "use_spectral_norm" is additionally passed to MultiPeriodDiscriminator.
gs_model_config = {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [3, 7, 11],
"resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
# 10 * 8 * 2 * 2 = 320, the same value as gs_data_config["hop_length"].
"upsample_rates": [10, 8, 2, 2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16, 16, 4, 4],
"n_layers_q": 3,
"use_spectral_norm": False,
"gin_channels": 256,
"ssl_dim": 256,
"n_speakers": 2
}
# Training settings read by the dataset and the training loop in this file.
gs_train_config = {
"log_interval": 1,
"eval_interval": 1000,
"seed": 1234,
"epochs": 1000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 12,
"fp16_run": False,
"lr_decay": 0.999875,
# 17920 samples = 56 frames of 320 samples (gs_data_config["hop_length"]).
"segment_size": 17920,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 1.0, # process_train_and_infer overwrites this with params.get("c_mel", 45)
"c_kl": 1.0,
"c_fm": 1.0,
"c_gen": 1.0,
"use_sr": True,
# Number of spectrogram frames in every training crop (TextAudioSpeakerLoader.spec_len).
"max_speclen": 384
}
# Audio and spectrogram settings: 32000 Hz sampling rate, FFT/window of 1280 samples, hop of 320 samples.
gs_data_config = {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1280,
"hop_length": 320,
"win_length": 1280,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": None
}
def get_f0(x, p_len, f0_up_key=0):
    """Extract an F0 track from 16 kHz audio and quantise it on a mel scale.

    Args:
        x: Waveform samples; they are handed to ``parselmouth.Sound`` with a
            sampling rate of 16000.
        p_len: Number of frames both returned tracks must have. Longer
            tracks are truncated, shorter ones are zero-padded on both sides.
        f0_up_key: Transposition in semitones applied to the F0 track.

    Returns:
        ``(f0_coarse, f0)`` where ``f0`` is the transposed F0 track and
        ``f0_coarse`` is its mel-scaled version rounded to integers and
        clipped to the range [1, 255].
    """
    time_step = 160 / 16000 * 1000  # analysis step in milliseconds
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    # Force the track to exactly p_len frames.
    if len(f0) > p_len:
        f0 = f0[:p_len]
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    positive = f0_mel > 0
    f0_mel[positive] = (f0_mel[positive] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # np.int was only an alias of the builtin int and was removed in
    # NumPy 1.24; the builtin yields the same dtype on every NumPy version.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0
def resize2d(x, target_len):
    """Linearly resample a 1-D track to ``target_len`` points.

    Values below 0.001 are treated as missing: they are replaced by NaN
    before interpolation, and every NaN in the interpolated result is turned
    back into 0.

    Args:
        x: 1-D sequence of numbers. It is copied, never modified.
        target_len: Number of points in the returned array.

    Returns:
        A float array with ``target_len`` elements.
    """
    # A float copy is required: NaN cannot be stored in an integer array, so
    # integer input used to raise ValueError on the assignment below.
    source = np.array(x, dtype=float)
    source[source < 0.001] = np.nan
    count = len(source)
    positions = np.arange(0, count * target_len, count) / target_len
    target = np.interp(positions, np.arange(0, count), source)
    return np.nan_to_num(target)
def compute_f0(x, sr, c_len):
    """Estimate F0 with WORLD (dio + stonemask) and resample it to ``c_len`` frames.

    Args:
        x: 1-D NumPy array of waveform samples.
        sr: Sampling rate of ``x`` in Hz.
        c_len: Number of frames the returned F0 track must have.

    Returns:
        ``(None, f0)``; the first element is always ``None`` and ``f0`` is
        the track produced by ``resize2d(f0, c_len)``.

    Raises:
        AssertionError: if ``c_len`` differs from ``len(x) // 320`` by 3 or more.
    """
    # x, sr = librosa.load(path, sr=32000)
    signal = x.astype(np.double)
    f0, t = pyworld.dio(
        signal,
        fs=sr,
        f0_ceil=800,
        frame_period=1000 * 320 / sr,  # one frame every 320 samples, in ms
    )
    # Refine with the actual sampling rate; this was hard-coded to 32000 and
    # therefore disagreed with the dio() call above for any other rate.
    f0 = pyworld.stonemask(signal, f0, t, sr)
    f0 = np.round(f0, 1)
    assert abs(c_len - x.shape[0] // 320) < 3, (c_len, f0.shape)
    return None, resize2d(f0, c_len)
def process(filename):
    """Cache the HuBERT content features and the F0 track of ``filename``.

    The features are stored in ``<filename>.soft.pt`` and the F0 track in
    ``<filename>.f0.npy``. A file that already exists is reused instead of
    being recomputed.

    Args:
        filename: Path of the audio file to preprocess.
    """
    save_name = filename + ".soft.pt"
    if not os.path.exists(save_name):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        wav, _ = librosa.load(filename, sr=16000)
        wav = torch.from_numpy(wav).unsqueeze(0).to(device)
        # Reuse the module-level HuBERT model instead of loading a new one
        # on every call.
        c = utils.get_hubert_content(gs_hmodel, wav)
        torch.save(c.cpu(), save_name)
    else:
        c = torch.load(save_name)

    f0path = filename + ".f0.npy"
    if not os.path.exists(f0path):
        # compute_f0 needs the samples and their rate; it used to be called
        # with the path and only two arguments, which raised TypeError.
        audio, sr = librosa.load(filename, sr=gs_data_config["sampling_rate"])
        _, f0 = compute_f0(audio, sr, c.shape[-1] * 2)
        np.save(f0path, f0)
def clean_pitch(input_pitch):
    """Flatten a pitch track whose entries are almost all equal to 1.

    When more than 90% of the entries equal 1, the remaining entries are
    overwritten with 1 as well. The array is modified in place and returned.
    """
    ones_mask = input_pitch == 1
    ones_ratio = np.sum(ones_mask) / len(input_pitch)
    if ones_ratio > 0.9:
        input_pitch[~ones_mask] = 1
    return input_pitch
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
"""
1) loads a single wav file and scales its samples by max_wav_value
2) computes its spectrogram, HuBERT content features and F0 track once, at construction time
3) serves random crops of max_speclen frames taken from those cached features
"""
def __init__(self, audio_path):
    """Copy the feature settings from the module configs and preprocess ``audio_path``."""
    self.audio_path = audio_path
    # Audio / STFT settings are taken over under the same attribute names.
    for key in ('max_wav_value', 'sampling_rate', 'filter_length', 'hop_length', 'win_length'):
        setattr(self, key, gs_data_config[key])
    self.use_sr = gs_train_config['use_sr']
    self.spec_len = gs_train_config['max_speclen']
    if torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")
    self.hmodel = gs_hmodel
    random.seed(1234)
    # Features are computed once here; later accesses only crop them.
    self.audio_data = self.get_audio(audio_path)
def get_audio(self, filename):
# Read the wav file `filename` and precompute the features used for training.
# Returns the tuple (c, f0, spec, audio_norm, spk): HuBERT features, F0 track,
# magnitude spectrogram, scaled waveform and the constant speaker id tensor [0].
# Original audio: 32 kHz mono
# Open question here:
# audio, sr = librosa.load(filename, sr=self.sampling_rate, mono=True)
sr, audio = read(filename)
audio = torch.FloatTensor(audio.astype(np.float32))
# Scale the samples by max_wav_value.
audio_norm = audio / self.max_wav_value
# NOTE(review): audio_norm is already a tensor at this point, so this call only copies it.
audio_norm = torch.tensor(audio_norm)
audio_norm = audio_norm.unsqueeze(0)
# Magnitude spectrogram: frame length 1280 (40 ms), hop 320 (10 ms), shape (641, frame_num)
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
# print(torch.mean(spec))
spec = torch.squeeze(spec, 0)
spk = torch.LongTensor([0])
# # Extract HuBERT features, shape (256, frame_num // 2); they are stretched to frame_num below
# NOTE(review): the next two lines are a diff hunk: '-' is the old positional call,
# '+' is its keyword-argument replacement.
- wav = librosa.resample(audio.numpy(), sr, 16000)
+ wav = librosa.resample(y=audio.numpy(), orig_sr=sr, target_sr=16000)
wav = torch.from_numpy(wav).unsqueeze(0).to(self.device)
c = utils.get_hubert_content(self.hmodel, wav).squeeze(0)
# Extract F0 features, shape (frame_num)
cf0, f0 = compute_f0(audio.numpy(), sr, c.shape[-1] * 2)
f0 = torch.FloatTensor(f0)
c = torch.repeat_interleave(c, repeats=2, dim=1) # shape=(256, frame_num)
# Shortest frame count among the three feature streams.
lmin = min(c.size(-1), spec.size(-1), f0.shape[0])
# When the condition before the comma is false, assert raises and reports the tuple after it
assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape, filename)
assert abs(lmin - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
assert abs(lmin - c.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
# Trim every stream to lmin frames and the waveform to lmin * hop_length samples.
spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin]
audio_norm = audio_norm[:, :lmin * self.hop_length]
_spec, _c, _audio_norm, _f0 = spec, c, audio_norm, f0
# Take the magnitude spectrogram, HuBERT features and F0:
# repeat the clip until it holds at least spec_len frames.
while spec.size(-1) < self.spec_len:
spec = torch.cat((spec, _spec), -1)
c = torch.cat((c, _c), -1)
f0 = torch.cat((f0, _f0), -1)
audio_norm = torch.cat((audio_norm, _audio_norm), -1)
# HuBERT features, F0, magnitude spectrogram, matching waveform, speaker id
return c, f0, spec, audio_norm, spk
def random_one(self):
c, f0, spec, audio_norm, spk = self.audio_data
start = random.randint(0, spec.size(-1) - self.spec_len)
end = start + self.spec_len
spec = spec[:, start:end]
c = c[:, start:end]
f0 = f0[start:end]
audio_norm = audio_norm[:, start * self.hop_length:end * self.hop_length]
return c, f0, spec, audio_norm, spk
def __getitem__(self, index):
c, f0, spec, audio_norm, spk = self.random_one()
# 没有人声的段,不要
cnt = 0
while torch.mean(torch.abs(audio_norm)) < 0.02 and cnt < 3:
c, f0, spec, audio_norm, spk = self.random_one()
cnt += 1
return c, f0, spec, audio_norm, spk
def __len__(self):
return 1
class SoVitsSVCOnlineTrain:
def construct_model(self):
    """Build fresh CUDA networks and optimizers initialised from the base checkpoints.

    Returns:
        ``(net_g, net_d, optim_g, optim_d)`` with the state dicts kept on
        ``self`` loaded and both learning rates reset to 2e-4.
    """
    spec_channels = gs_data_config["filter_length"] // 2 + 1
    segment_frames = gs_train_config["segment_size"] // gs_data_config["hop_length"]
    net_g = SynthesizerTrn(
        spec_channels,
        segment_frames,
        **gs_model_config,
        no_flow=False,
        use_v3=False).cuda()
    net_d = MultiPeriodDiscriminator(gs_model_config['use_spectral_norm']).cuda()

    optim_g, optim_d = [
        torch.optim.AdamW(net.parameters(), 0.0001, betas=[0.8, 0.99], eps=1e-09)
        for net in (net_g, net_d)
    ]

    # checkpoint_dict = torch.load(base_g_model, map_location='cuda')
    net_g.load_state_dict(self.g_model_dict)
    net_d.load_state_dict(self.d_model_dict)
    optim_g.load_state_dict(self.g_opt_dict)
    optim_d.load_state_dict(self.d_opt_dict)

    # Set the initial learning rate (after the optimizer states were loaded).
    for optimizer in (optim_g, optim_d):
        optimizer.param_groups[0]['lr'] = 2e-4
    return net_g, net_d, optim_g, optim_d
def __init__(self, base_g_model, base_d_model):
st1 = time.time()
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint_dict = torch.load(base_g_model, map_location='cpu')
self.g_model_dict = checkpoint_dict["model"]
self.g_opt_dict = checkpoint_dict["optimizer"]
checkpoint_dict = torch.load(base_d_model, map_location='cpu')
self.d_model_dict = checkpoint_dict["model"]
self.d_opt_dict = checkpoint_dict["optimizer"]
print("load model_path={},{},sp={}".format(base_g_model, base_d_model, time.time() - st1))
def get_units(self, source, sr):
    """Run the shared HuBERT model on ``source`` and return its units.

    ``source`` gets a leading dimension and is moved to ``self.device``.
    ``sr`` is accepted but not used here.
    """
    batch = source.unsqueeze(0).to(self.device)
    print("source_shape===>", batch.shape)
    with torch.inference_mode():
        began = time.time()
        units = gs_hmodel.units(batch)
        print("hubert use time:{}".format(time.time() - began))
    return units
def get_unit_pitch(self, source, sr, tran):
    """Return the HuBERT units and the F0 track of ``source``.

    Args:
        source: Waveform tensor sampled at ``sr``.
        sr: Sampling rate of ``source``; the audio is resampled to 16000.
        tran: Transposition in semitones forwarded to ``get_f0``.

    Returns:
        ``(soft, f0)``: the units as a NumPy array and the F0 track from
        ``get_f0``, requested with twice as many frames as ``soft`` has rows.
    """
    wav16k = torchaudio.functional.resample(source, sr, 16000)
    if len(wav16k.shape) == 2 and wav16k.shape[1] >= 2:
        # Average over dim 0 and restore the leading dimension.
        wav16k = torch.mean(wav16k, dim=0).unsqueeze(0)
    soft = self.get_units(wav16k, sr).squeeze(0).cpu().numpy()
    samples = wav16k.cpu().numpy()[0]
    _, f0 = get_f0(samples, soft.shape[0] * 2, tran)
    return soft, f0
def train(self, in_wav, epoch_num):
# Fine-tune the base generator and discriminator on the single file `in_wav`
# for `epoch_num` epochs and return the trained generator (net_g).
# TextAudioSpeakerLoader.__len__ returns 1, so each pass over train_loader yields one batch.
train_dataset = TextAudioSpeakerLoader(in_wav)
train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, batch_size=12)
net_g, net_d, optim_g, optim_d = self.construct_model()
rank = 0
# Used to speed up training
torch.set_float32_matmul_precision('high')
net_g.train()
net_d.train()
global_step = 0
scaler = GradScaler(enabled=gs_train_config['fp16_run'])
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=gs_train_config['lr_decay'], last_epoch=1)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=gs_train_config['lr_decay'], last_epoch=1)
# Update the learning rate based on the previous result
# Idea: raise the learning rate when the loss drops, lower it when the loss rises
# NOTE(review): the schedulers created above are plain ExponentialLR; the idea described here is not implemented in this method.
for epoch in tqdm(range(0, epoch_num)):
for batch_idx, items in enumerate(train_loader):
# HuBERT features, F0, magnitude spectrogram, matching waveform segment (384 * hop_length), speaker id [0]
c, f0, spec, y, spk = items
g = spk.cuda(rank, non_blocking=True)
spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
c = c.cuda(rank, non_blocking=True)
f0 = f0.cuda(rank, non_blocking=True)
"""
"sampling_rate": 32000,
"filter_length": 1280,
"hop_length": 320,
"win_length": 1280,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
"""
# spec, n_fft, num_mels, sampling_rate, fmin, fmax
mel = spec_to_mel_torch(spec, gs_data_config['filter_length'], gs_data_config['n_mel_channels'],
gs_data_config['sampling_rate'], gs_data_config['mel_fmin'],
gs_data_config['mel_fmax'])
with autocast(enabled=gs_train_config['fp16_run']):
# Inputs of net_g: HuBERT features, F0, magnitude spectrogram, speaker id, mel spectrogram
# Outputs of net_g:
# generated waveform, position of the frames sampled for each batch item, mask of the valid spectrogram frames,
# z sampled from the distribution encoded from the magnitude spectrogram, z_p = z after the normalizing flow, mean of the distribution from the HuBERT feature side (m_p),
# its standard deviation (logs_p), mean from the spectrogram and speaker information (m_q), and its standard deviation (logs_q)
y_hat, ids_slice, z_mask, \
(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel)
y_mel = commons.slice_segments(mel, ids_slice,
gs_train_config['segment_size'] // gs_data_config['hop_length'])
y_hat_mel = mel_spectrogram_torch(
y_hat.squeeze(1),
gs_data_config['filter_length'],
gs_data_config['n_mel_channels'],
gs_data_config['sampling_rate'],
gs_data_config['hop_length'],
gs_data_config['win_length'],
gs_data_config['mel_fmin'],
gs_data_config['mel_fmax']
)
y = commons.slice_segments(y, ids_slice * gs_data_config['hop_length'],
gs_train_config['segment_size']) # slice
# Discriminator
# y_hat is detached so this loss does not back-propagate into net_g.
y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
loss_disc_all = loss_disc
optim_d.zero_grad()
scaler.scale(loss_disc_all).backward()
scaler.unscale_(optim_d)
scaler.step(optim_d)
with autocast(enabled=gs_train_config['fp16_run']):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
with autocast(enabled=False):
# L1 loss between the mel spectrograms, scaled by its coefficient; smaller is better
loss_mel = F.l1_loss(y_mel, y_hat_mel) * gs_train_config['c_mel']
# KL divergence. z_p: sample from the spectrogram side after the normalizing flow, logs_q: standard deviation from the spectrogram side, m_p: mean from the HuBERT side
# logs_p: standard deviation from the HuBERT side, z_mask: mask of the valid spectrogram frames
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * gs_train_config['c_kl']
# L1 distance between the per-layer discriminator features of y and y_hat
loss_fm = feature_loss(fmap_r, fmap_g) * gs_train_config['c_fm']
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen * gs_train_config['c_gen'] + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
scaler.step(optim_g)
scaler.update()
# Log the rounded losses every log_interval steps; "tot" is the sum of the rounded values.
if global_step % gs_train_config['log_interval'] == 0:
lr = optim_g.param_groups[0]['lr']
losses_numpy = [round(loss_disc.item(), 3), round(loss_gen.item(), 3),
round(loss_fm.item(), 3), round(loss_mel.item(), 3), round(loss_kl.item(), 3)]
print("gstep={},lr={},disc={},gen={},fm={},mel={},kl={},tot={}".format(global_step, lr,
losses_numpy[0],
losses_numpy[1],
losses_numpy[2],
losses_numpy[3],
losses_numpy[4],
sum(losses_numpy)))
# NOTE(review): the three '-' lines below are a diff hunk that removes a periodic checkpoint dump.
- if global_step % 200 == 0:
- torch.save(net_g.state_dict(), "data/web_trained_models/xiafan_{}.pth".format(global_step))
-
global_step += 1
scheduler_g.step()
scheduler_d.step()
return net_g
def infer(self, in_wav, dst_wav, model):
    """Convert ``in_wav`` with ``model`` and write the result to ``dst_wav``.

    Args:
        in_wav: Path of the input audio; it is loaded as 32000 Hz mono.
        dst_wav: Path of the wav file to write.
        model: Generator network; it is switched to eval mode.
    """
    semitone_shift = 0  # pitch transposition
    waveform, sample_rate = librosa.load(in_wav, sr=32000, mono=True)
    waveform = torch.tensor(waveform).unsqueeze(0)
    speaker = torch.LongTensor([0]).to(self.device).unsqueeze(0)

    units, pitch = self.get_unit_pitch(waveform, sample_rate, semitone_shift)
    pitch_track = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
    content = torch.FloatTensor(units)

    with torch.no_grad():
        model.eval()
        content = content.unsqueeze(0).to(self.device)
        began = time.time()
        # Repeat every unit frame twice before handing it to the model.
        content = torch.repeat_interleave(content, repeats=2, dim=1).transpose(1, 2)
        converted = model.infer(content, f0=pitch_track, g=speaker)[0, 0].data.float()
        print("vits use time:{}".format(time.time() - began))

    # Write the result to disk.
    soundfile.write(dst_wav, converted.cpu().numpy(), sample_rate, format='wav')
####### 对外接口,训练并预测
def process_train_and_infer(self, train_media, in_path, dst_path, dst_model_path=None, params=None):
    """Train on ``train_media``, then convert ``in_path`` into ``dst_path``.

    :param train_media: audio used for training
    :param in_path: audio whose voice is to be converted
    :param dst_path: path of the converted output file
    :param dst_model_path: when not None, the trained generator state dict is saved here
    :param params: optional overrides: ``max_step`` (default 200), ``c_mel``
        (default 45), ``c_fm`` (default 1.0), ``c_gen`` (default 1.0)
    :return: 0 on success; 1 if converting ``train_media`` to 32 kHz mono
        failed, 2 if denoising failed, 3 if converting ``in_path`` failed
    """
    import subprocess  # local import keeps this method self-contained

    def run_tool(argv):
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact. As before, failure is detected by the
        # caller through the missing output file, not the exit status.
        try:
            subprocess.run(argv, check=False)
        except OSError as err:
            print("failed to run {}: {}".format(argv[0], err))

    def to_32k_mono(src, dst):
        run_tool(["ffmpeg", "-i", src, "-ar", "32000", "-ac", "1", "-y", dst])
        return os.path.exists(dst)

    params = {} if params is None else params
    created = []  # temporary files produced by this call
    try:
        # Transcode train_media to 32 kHz mono.
        tmp_32_wav = train_media + "_321.wav"
        if not to_32k_mono(train_media, tmp_32_wav):
            return 1
        created.append(tmp_32_wav)

        # Denoise.
        tmp_wav = train_media + "_de321.wav"
        run_tool([gs_denoise_exe, tmp_32_wav, tmp_wav])
        if not os.path.exists(tmp_wav):
            return 2
        created.append(tmp_wav)

        in_wav_tmp = in_path + "_321.wav"
        if not to_32k_mono(in_path, in_wav_tmp):
            return 3
        created.append(in_wav_tmp)

        max_step = params.get('max_step', 200)
        gs_train_config['c_mel'] = params.get("c_mel", 45)
        gs_train_config['c_fm'] = params.get("c_fm", 1.0)
        gs_train_config['c_gen'] = params.get("c_gen", 1.0)
        print("params:{}".format(params))

        st = time.time()
        model = self.train(tmp_wav, max_step)
        print("train sp={}".format(time.time() - st))

        st = time.time()
        self.infer(in_wav_tmp, dst_path, model)
        print("infer sp={}".format(time.time() - st))

        if dst_model_path is not None:
            st = time.time()
            torch.save(model.state_dict(), dst_model_path)
            print("save model sp={}".format(time.time() - st))
        return 0
    finally:
        # Also runs when training or inference raises, so temporary files
        # no longer leak on failure.
        for path in created:
            if os.path.exists(path):
                os.unlink(path)
# 推理结果
def process_infer(self, model_path, in_path, dst_path):
    """Convert ``in_path`` into ``dst_path`` with a previously saved generator.

    Args:
        model_path: Path of a generator state dict.
        in_path: Audio whose voice is to be converted.
        dst_path: Path of the converted output file.

    Returns:
        2 if ``in_path`` could not be converted to 32 kHz mono, otherwise
        ``None``.
    """
    import subprocess  # local import keeps this method self-contained

    # Use self.device like the rest of the inference path instead of
    # requiring CUDA here.
    net_g = SynthesizerTrn(
        gs_data_config["filter_length"] // 2 + 1,
        gs_train_config["segment_size"] // gs_data_config["hop_length"],
        **gs_model_config,
        no_flow=False,
        use_v3=False).to(self.device)
    model_dict = torch.load(model_path, map_location='cpu')
    net_g.load_state_dict(model_dict)

    in_wav_tmp = in_path + "_321.wav"
    try:
        # Argument list instead of a shell string, so paths with spaces or
        # shell metacharacters are passed through unchanged.
        subprocess.run(
            ["ffmpeg", "-i", in_path, "-ar", "32000", "-ac", "1", "-y", in_wav_tmp],
            check=False)
    except OSError as err:
        print("failed to run ffmpeg: {}".format(err))
    if not os.path.exists(in_wav_tmp):
        return 2

    try:
        self.infer(in_wav_tmp, dst_path, net_g)
    finally:
        # The converted temporary file used to be left behind.
        os.unlink(in_wav_tmp)
if __name__ == '__main__':
    # Manual smoke run: fine-tune on one recording, convert a test vocal and
    # save the trained generator.
    train_wav = "data/train_users/qiankun_v1/vocals/speaker0/qiankun.wav"
    source_wav = "data/test/vocal_32.wav"
    converted_wav = "data/test/vocal_32_out.wav"
    trained_model = "data/test/mm.pth"
    base_g = "data/online_models/models/base_model/sunyanzi_base_2000.pth"
    base_d = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth"

    trainer = SoVitsSVCOnlineTrain(base_g, base_d)
    began = time.time()
    status = trainer.process_train_and_infer(train_wav, source_wav, converted_wav, trained_model)
    print("process = {} ret={}".format(time.time() - began, status))

File Metadata

Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:32 (1 d, 15 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347187
Default Alt Text
(22 KB)

Event Timeline