diff --git a/AutoCoverTool/script/train_user_by_one_media.py b/AutoCoverTool/script/train_user_by_one_media.py
index e733f67..561cd63 100644
--- a/AutoCoverTool/script/train_user_by_one_media.py
+++ b/AutoCoverTool/script/train_user_by_one_media.py
@@ -1,547 +1,544 @@
"""
Train a voice model from a single utterance.
1. Build the dataset
2. Train
"""
from ref.so_vits_svc.models import SynthesizerTrn, MultiPeriodDiscriminator
from ref.so_vits_svc.mel_processing import spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch
import ref.so_vits_svc.utils as utils
import ref.so_vits_svc.commons as commons
from ref.so_vits_svc.losses import kl_loss, generator_loss, discriminator_loss, feature_loss

import logging

logging.getLogger('numba').setLevel(logging.WARNING)

import os
import time
import torch
import random
import librosa
import soundfile
import torchaudio
import parselmouth
import numpy as np
from tqdm import tqdm
from scipy.io.wavfile import read
from pyworld import pyworld
from copy import deepcopy
import torch.utils.data
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

gs_denoise_exe = "/data/gpu_env_common/bin/denoise_exe"
gs_hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None)
gs_model_config = {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [10, 8, 2, 2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16, 16, 4, 4],
    "n_layers_q": 3,
    "use_spectral_norm": False,
    "gin_channels": 256,
    "ssl_dim": 256,
    "n_speakers": 2
}
gs_train_config = {
    "log_interval": 1,
    "eval_interval": 1000,
    "seed": 1234,
    "epochs": 1000,
    "learning_rate": 0.0001,
    "betas": [0.8, 0.99],
    "eps": 1e-09,
    "batch_size": 12,
    "fp16_run": False,
    "lr_decay": 0.999875,
    "segment_size": 17920,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 1.0,  # 45
    "c_kl": 1.0,
    "c_fm": 1.0,
    "c_gen": 1.0,
    "use_sr": True,
    "max_speclen": 384
}
gs_data_config = {
    "max_wav_value": 32768.0,
    "sampling_rate": 32000,
    "filter_length": 1280,
    "hop_length": 320,
    "win_length": 1280,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": None
}


def get_f0(x, p_len, f0_up_key=0):
    time_step = 160 / 16000 * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    if len(f0) > p_len:
        f0 = f0[:p_len]
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0


def resize2d(x, target_len):
    source = np.array(x)
    source[source < 0.001] = np.nan
    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
                       np.arange(0, len(source)), source)
    res = np.nan_to_num(target)
    return res


def compute_f0(x, sr, c_len):
    # x, sr = librosa.load(path, sr=32000)
    f0, t = pyworld.dio(
        x.astype(np.double),
        fs=sr,
        f0_ceil=800,
        frame_period=1000 * 320 / sr,
    )
    f0 = pyworld.stonemask(x.astype(np.double), f0, t, 32000)
    for index, pitch in enumerate(f0):
        f0[index] = round(pitch, 1)
    assert abs(c_len - x.shape[0] // 320) < 3, (c_len, f0.shape)
    return None, resize2d(f0, c_len)


def process(filename):
    hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None)
    save_name = filename + ".soft.pt"
    if not os.path.exists(save_name):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        wav, _ = librosa.load(filename, sr=16000)
        wav = torch.from_numpy(wav).unsqueeze(0).to(device)
        c = utils.get_hubert_content(hmodel, wav)
        torch.save(c.cpu(), save_name)
    else:
        c = torch.load(save_name)

    f0path = filename + ".f0.npy"
    if not os.path.exists(f0path):
        wav32, sr32 = librosa.load(filename, sr=32000)
        cf0, f0 = compute_f0(wav32, sr32, c.shape[-1] * 2)
        np.save(f0path, f0)


def clean_pitch(input_pitch):
    num_nan = np.sum(input_pitch == 1)
    if num_nan / len(input_pitch) > 0.9:
        input_pitch[input_pitch != 1] = 1
    return input_pitch


class TextAudioSpeakerLoader(torch.utils.data.Dataset):
    """
    1) loads audio, speaker_id, text pairs
    2) normalizes text and converts them to sequences of integers
    3) computes spectrograms from audio files.
    """

    def __init__(self, audio_path):
        self.audio_path = audio_path
        self.max_wav_value = gs_data_config['max_wav_value']
        self.sampling_rate = gs_data_config['sampling_rate']
        self.filter_length = gs_data_config['filter_length']
        self.hop_length = gs_data_config['hop_length']
        self.win_length = gs_data_config['win_length']
        self.use_sr = gs_train_config['use_sr']
        self.spec_len = gs_train_config['max_speclen']
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.hmodel = gs_hmodel
        random.seed(1234)
        self.audio_data = self.get_audio(audio_path)

    def get_audio(self, filename):
        # The source audio is 32 kHz mono.
        # There is some uncertainty here:
        # audio, sr = librosa.load(filename, sr=self.sampling_rate, mono=True)
        sr, audio = read(filename)
        audio = torch.FloatTensor(audio.astype(np.float32))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)

        # Magnitude spectrogram: window 1280 (40 ms), hop 320 (10 ms), shape (641, frame_num)
        spec = spectrogram_torch(audio_norm, self.filter_length,
                                 self.sampling_rate, self.hop_length, self.win_length,
                                 center=False)
        # print(torch.mean(spec))
        spec = torch.squeeze(spec, 0)
        spk = torch.LongTensor([0])

        # Extract hubert features, shape (256, frame_num // 2); padded to full length below
-        wav = librosa.resample(audio.numpy(), sr, 16000)
+        wav = librosa.resample(y=audio.numpy(), orig_sr=sr, target_sr=16000)
        wav = torch.from_numpy(wav).unsqueeze(0).to(self.device)
        c = utils.get_hubert_content(self.hmodel, wav).squeeze(0)

        # Extract f0 features, shape (frame_num,)
        cf0, f0 = compute_f0(audio.numpy(), sr, c.shape[-1] * 2)
        f0 = torch.FloatTensor(f0)
        c = torch.repeat_interleave(c, repeats=2, dim=1)  # shape (256, frame_num)

        lmin = min(c.size(-1), spec.size(-1), f0.shape[0])
        # If the condition fails, the assert raises and reports the tuple that follows
        assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape, filename)
        assert abs(lmin - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
        assert abs(lmin - c.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
        spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin]
        audio_norm = audio_norm[:, :lmin * self.hop_length]
        _spec, _c, _audio_norm, _f0 = spec, c, audio_norm, f0

        # Tile spectrogram, hubert features, f0 and waveform until they cover at least spec_len frames
        while spec.size(-1) < self.spec_len:
            spec = torch.cat((spec, _spec), -1)
            c = torch.cat((c, _c), -1)
            f0 = torch.cat((f0, _f0), -1)
            audio_norm = torch.cat((audio_norm, _audio_norm), -1)

        # hubert features, f0, magnitude spectrogram, matching waveform, speaker id
        return c, f0, spec, audio_norm, spk
    def random_one(self):
        c, f0, spec, audio_norm, spk = self.audio_data
        start = random.randint(0, spec.size(-1) - self.spec_len)
        end = start + self.spec_len
        spec = spec[:, start:end]
        c = c[:, start:end]
        f0 = f0[start:end]
        audio_norm = audio_norm[:, start * self.hop_length:end * self.hop_length]
        return c, f0, spec, audio_norm, spk

    def __getitem__(self, index):
        c, f0, spec, audio_norm, spk = self.random_one()
        # Skip segments with no voice in them (retry up to 3 times)
        cnt = 0
        while torch.mean(torch.abs(audio_norm)) < 0.02 and cnt < 3:
            c, f0, spec, audio_norm, spk = self.random_one()
            cnt += 1
        return c, f0, spec, audio_norm, spk

    def __len__(self):
        return 1


class SoVitsSVCOnlineTrain:
    def construct_model(self):
        net_g = SynthesizerTrn(
            gs_data_config["filter_length"] // 2 + 1,
            gs_train_config["segment_size"] // gs_data_config["hop_length"],
            **gs_model_config, no_flow=False, use_v3=False).cuda()
        net_d = MultiPeriodDiscriminator(gs_model_config['use_spectral_norm']).cuda()
        optim_g = torch.optim.AdamW(
            net_g.parameters(), 0.0001, betas=[0.8, 0.99], eps=1e-09)
        optim_d = torch.optim.AdamW(
            net_d.parameters(), 0.0001, betas=[0.8, 0.99], eps=1e-09)
        # checkpoint_dict = torch.load(base_g_model, map_location='cuda')
        net_g.load_state_dict(self.g_model_dict)
        net_d.load_state_dict(self.d_model_dict)
        optim_g.load_state_dict(self.g_opt_dict)
        optim_d.load_state_dict(self.d_opt_dict)

        # Set the initial learning rate
        optim_g.param_groups[0]['lr'] = 2e-4
        optim_d.param_groups[0]['lr'] = 2e-4
        return net_g, net_d, optim_g, optim_d

    def __init__(self, base_g_model, base_d_model):
        st1 = time.time()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        checkpoint_dict = torch.load(base_g_model, map_location='cpu')
        self.g_model_dict = checkpoint_dict["model"]
        self.g_opt_dict = checkpoint_dict["optimizer"]
        checkpoint_dict = torch.load(base_d_model, map_location='cpu')
        self.d_model_dict = checkpoint_dict["model"]
        self.d_opt_dict = checkpoint_dict["optimizer"]
        print("load model_path={},{},sp={}".format(base_g_model, base_d_model, time.time() - st1))

    def get_units(self, source, sr):
        source = source.unsqueeze(0).to(self.device)
        print("source_shape===>", source.shape)
        with torch.inference_mode():
            start = time.time()
            units = gs_hmodel.units(source)
            use_time = time.time() - start
            print("hubert use time:{}".format(use_time))
        return units

    def get_unit_pitch(self, source, sr, tran):
        source = torchaudio.functional.resample(source, sr, 16000)
        if len(source.shape) == 2 and source.shape[1] >= 2:
            source = torch.mean(source, dim=0).unsqueeze(0)
        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
        f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
        return soft, f0

    def train(self, in_wav, epoch_num):
        train_dataset = TextAudioSpeakerLoader(in_wav)
        train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, batch_size=12)
        net_g, net_d, optim_g, optim_d = self.construct_model()
        rank = 0

        # For faster training
        torch.set_float32_matmul_precision('high')
        net_g.train()
        net_d.train()
        global_step = 0
        scaler = GradScaler(enabled=gs_train_config['fp16_run'])

        scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=gs_train_config['lr_decay'],
                                                             last_epoch=1)
        scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=gs_train_config['lr_decay'],
                                                             last_epoch=1)
        # Update the learning rate based on the previous result.
        # Idea: raise the learning rate when the loss decreases, lower it when the loss increases.
        for epoch in tqdm(range(0, epoch_num)):
            for batch_idx, items in enumerate(train_loader):
                # hubert features, f0, magnitude spectrogram, matching waveform (384 * hop_length), speaker id [0]
                c, f0, spec, y, spk = items
                g = spk.cuda(rank, non_blocking=True)
                spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
                c = c.cuda(rank, non_blocking=True)
                f0 = f0.cuda(rank, non_blocking=True)
                """
                "sampling_rate": 32000,
                "filter_length": 1280,
                "hop_length": 320,
                "win_length": 1280,
                "n_mel_channels": 80,
                "mel_fmin": 0.0,
                "mel_fmax": null
                """
                # spec, n_fft, num_mels, sampling_rate, fmin, fmax
                mel = spec_to_mel_torch(spec, gs_data_config['filter_length'], gs_data_config['n_mel_channels'],
                                        gs_data_config['sampling_rate'], gs_data_config['mel_fmin'],
                                        gs_data_config['mel_fmax'])
                with autocast(enabled=gs_train_config['fp16_run']):
                    # net_g input: hubert features, f0, magnitude spectrogram, speaker id, mel spectrogram
                    # net_g output:
                    #   raw waveform, position of each sampled frame in the batch, valid spectrogram frame positions,
                    #   z sampled from the normal distribution encoded from the spectrogram,
                    #   z_p = z after the normalizing flow, mean of the hubert-side distribution (m_p),
                    #   std of the hubert-side distribution (logs_p),
                    #   mean from spectrogram + speaker info (m_q), std from spectrogram + speaker info (logs_q)
                    y_hat, ids_slice, z_mask, \
                        (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel)
                    y_mel = commons.slice_segments(mel, ids_slice,
                                                   gs_train_config['segment_size'] // gs_data_config['hop_length'])
                    y_hat_mel = mel_spectrogram_torch(
                        y_hat.squeeze(1),
                        gs_data_config['filter_length'],
                        gs_data_config['n_mel_channels'],
                        gs_data_config['sampling_rate'],
                        gs_data_config['hop_length'],
                        gs_data_config['win_length'],
                        gs_data_config['mel_fmin'],
                        gs_data_config['mel_fmax']
                    )
                    y = commons.slice_segments(y, ids_slice * gs_data_config['hop_length'],
                                               gs_train_config['segment_size'])  # slice

                    # Discriminator
                    y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
                    with autocast(enabled=False):
                        loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
                        loss_disc_all = loss_disc

                optim_d.zero_grad()
                scaler.scale(loss_disc_all).backward()
                scaler.unscale_(optim_d)
                scaler.step(optim_d)

                with autocast(enabled=gs_train_config['fp16_run']):
                    # Generator
                    y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
                    with autocast(enabled=False):
                        # L1 loss between mel spectrograms, scaled by c_mel; smaller is better
                        loss_mel = F.l1_loss(y_mel, y_hat_mel) * gs_train_config['c_mel']
                        # KL divergence. z_p: spectrogram-side sample after the flow, logs_q: spectrogram-side std,
                        # m_p: hubert-side mean, logs_p: hubert-side std, z_mask: valid spectrogram frame positions
                        loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * gs_train_config['c_kl']
                        # Feature-matching loss: L1 distance between every discriminator layer's features for y and y_hat
                        loss_fm = feature_loss(fmap_r, fmap_g) * gs_train_config['c_fm']
                        loss_gen, losses_gen = generator_loss(y_d_hat_g)
                        loss_gen_all = loss_gen * gs_train_config['c_gen'] + loss_fm + loss_mel + loss_kl

                optim_g.zero_grad()
                scaler.scale(loss_gen_all).backward()
                scaler.unscale_(optim_g)
                scaler.step(optim_g)
                scaler.update()

                if global_step % gs_train_config['log_interval'] == 0:
                    lr = optim_g.param_groups[0]['lr']
                    losses_numpy = [round(loss_disc.item(), 3), round(loss_gen.item(), 3),
                                    round(loss_fm.item(), 3), round(loss_mel.item(), 3),
                                    round(loss_kl.item(), 3)]
                    print("gstep={},lr={},disc={},gen={},fm={},mel={},kl={},tot={}".format(
                        global_step, lr, losses_numpy[0], losses_numpy[1], losses_numpy[2],
                        losses_numpy[3], losses_numpy[4], sum(losses_numpy)))

-                if global_step % 200 == 0:
-                    torch.save(net_g.state_dict(), "data/web_trained_models/xiafan_{}.pth".format(global_step))
-
                global_step += 1
            scheduler_g.step()
            scheduler_d.step()
        return net_g

    def infer(self, in_wav, dst_wav, model):
        tran = 0  # pitch shift (semitones)
        source, sr = librosa.load(in_wav, sr=32000, mono=True)
        source = torch.tensor(source).unsqueeze(0)
        sid = torch.LongTensor([0]).to(self.device).unsqueeze(0)
        soft, pitch = self.get_unit_pitch(source, sr, tran)
        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
        stn_tst = torch.FloatTensor(soft)
        with torch.no_grad():
            model.eval()
            x_tst = stn_tst.unsqueeze(0).to(self.device)
            start = time.time()
            x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
            audio = model.infer(x_tst, f0=f0, g=sid)[0, 0].data.float()
            use_time = time.time() - start
            print("vits use time:{}".format(use_time))
        # Write the result to file
        soundfile.write(dst_wav, audio.cpu().numpy(), sr, format='wav')

    # ###### Public entry point: train and then infer
    def process_train_and_infer(self, train_media, in_path, dst_path, dst_model_path=None, params={}):
        """
        :param train_media: audio used for training
        :param in_path: vocal audio to be converted
        :param dst_path: path of the converted output file
        :param dst_model_path: optional path for caching the trained model
        :return:
        """
        # Transcode train_media to 32 kHz mono
        tmp_32_wav = train_media + "_321.wav"
        cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(train_media, tmp_32_wav)
        os.system(cmd)
        if not os.path.exists(tmp_32_wav):
            return 1

        # Denoise
        tmp_wav = train_media + "_de321.wav"
        cmd = "{} {} {}".format(gs_denoise_exe, tmp_32_wav, tmp_wav)
        os.system(cmd)
        if not os.path.exists(tmp_wav):
            os.unlink(tmp_32_wav)
            return 2

        in_wav_tmp = in_path + "_321.wav"
        cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(in_path, in_wav_tmp)
        os.system(cmd)
        if not os.path.exists(in_wav_tmp):
            os.unlink(tmp_32_wav)
            os.unlink(tmp_wav)
            return 3

        global gs_train_config
        max_step = params.get('max_step', 200)
        gs_train_config['c_mel'] = params.get("c_mel", 45)
        gs_train_config['c_fm'] = params.get("c_fm", 1.0)
        gs_train_config['c_gen'] = params.get("c_gen", 1.0)
        print("params:{}".format(params))

        st = time.time()
        model = self.train(tmp_wav, max_step)
        print("train sp={}".format(time.time() - st))
        st = time.time()
        self.infer(in_wav_tmp, dst_path, model)
        print("infer sp={}".format(time.time() - st))
        if dst_model_path is not None:
            st = time.time()
            torch.save(model.state_dict(), dst_model_path)
            print("save model sp={}".format(time.time() - st))

        os.unlink(tmp_32_wav)
        os.unlink(tmp_wav)
        os.unlink(in_wav_tmp)
        return 0

    # Inference from a saved model
    def process_infer(self, model_path, in_path, dst_path):
        net_g = SynthesizerTrn(
            gs_data_config["filter_length"] // 2 + 1,
            gs_train_config["segment_size"] // gs_data_config["hop_length"],
            **gs_model_config, no_flow=False, use_v3=False).cuda()
        model_dict = torch.load(model_path, map_location='cpu')
        net_g.load_state_dict(model_dict)

        in_wav_tmp = in_path + "_321.wav"
        cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(in_path, in_wav_tmp)
        os.system(cmd)
        if not os.path.exists(in_wav_tmp):
            return 2
        self.infer(in_wav_tmp, dst_path, net_g)


if __name__ == '__main__':
    pp = "data/train_users/qiankun_v1/vocals/speaker0/qiankun.wav"
    in_p = "data/test/vocal_32.wav"
    dst_p = "data/test/vocal_32_out.wav"
    dst_m_p = "data/test/mm.pth"
    g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth"
    d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth"
    svsot = SoVitsSVCOnlineTrain(g_path, d_path)
    start_time = time.time()
    ret = svsot.process_train_and_infer(pp, in_p, dst_p, dst_m_p)
    print("process = {} ret={}".format(time.time() - start_time, ret))