diff --git a/AutoCoverTool/ref/so_vits_svc/apply_model.py b/AutoCoverTool/ref/so_vits_svc/apply_model.py new file mode 100644 index 0000000..7570e91 --- /dev/null +++ b/AutoCoverTool/ref/so_vits_svc/apply_model.py @@ -0,0 +1,147 @@ +from ref.so_vits_svc.models import * +import time +import torch +from thop import profile +from thop import clever_format + +gs_model_config = { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 10, + 8, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "n_layers_q": 3, + "use_spectral_norm": False, + "gin_channels": 256, + "ssl_dim": 256, + "n_speakers": 2 +} + + +def load_model(): + mm = SynthesizerTrn( + 641, 56, + **gs_model_config + ) + device = 'cuda' + checkpoint_path = "data/train_users/xiafan/logs/32k/G_0.pth" + checkpoint_dict = torch.load(checkpoint_path, map_location='cuda') + mm.load_state_dict(checkpoint_dict["model"]) + mm.eval() + mm.to(device) + + # params = mm.state_dict() + # save_path = "/tmp/t1.pth" + # torch.save(params, save_path) + + # c, f0, spec, g=None + hub_data = torch.rand((1, 256, 1)).to(device) + f0 = torch.rand((1, 1)).to(device) + g = torch.tensor([[0]]).to(device) + spec = torch.rand((1, 641, 1)).to(device) + + with torch.no_grad(): + for i in range(0, 10): + st = time.time() + mm(hub_data, f0, spec, g) + print("sp = {}".format(time.time() - st)) + + flops, params = profile(mm, inputs=(hub_data, f0, spec, g), verbose=True) + flops, params = clever_format([flops, params], "%.3f") + # 487.096M 58.126M + print(flops, params) + # macs, params = clever_format([flops, params], "%.3f") + # print(macs) + # print(params) + # + # print(mm) + # stat(mm, input_size=(hub_data, f0, spec, g)) + # print("%s | %.2f | %.2f" % ("synthesizer_trn", params / (1000 ** 2), flops / (1000 ** 3))) + + +def change_iter(): + checkpoint_path = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models/base_model/sunyanzi_base_48000.pth" + dst_checkpoint_path = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models/base_model/sunyanzi_base_48000_no_opt.pth" + checkpoint_dict = torch.load(checkpoint_path, map_location='cuda') + checkpoint_dict["iteration"] = 1 + checkpoint_dict["learning_rate"] = 1e-4 + checkpoint_dict["optimizer"] = None + torch.save(checkpoint_dict, dst_checkpoint_path) + + +def reload_model(): + in_checkpoint_path = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models/base_model/sunyanzi_base_48000.pth" + out_checkpoint_path = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models/base_model/sunyanzi_base_48000_no_flow.pth" + checkpoint_dict = torch.load(in_checkpoint_path, map_location='cuda') + device = 'cuda' + model = SynthesizerTrn( + 641, 56, + **gs_model_config, + no_flow=True, + use_v3=False + ) + state_dict = {} + for k in checkpoint_dict["model"].keys(): + if str(k).startswith("flow"): + continue + state_dict[k] = checkpoint_dict["model"][k] + + model.load_state_dict(state_dict) + model.eval() + model.to(device) + # print(model) + hub_data = torch.rand((1, 256, 1)).to(device) + f0 = torch.rand((1, 1)).to(device) + g = torch.tensor([[0]]).to(device) + spec = torch.rand((1, 641, 
1)).to(device) + o, ids_slice, spec_mask, (z, z, m_p, logs_p, m_q, logs_q) = model(hub_data, f0, spec, g) + print(o.shape) + + checkpoint_dict["model"] = model.state_dict() + checkpoint_dict["iteration"] = 1 + checkpoint_dict["learning_rate"] = 1e-4 + checkpoint_dict["optimizer"] = None + torch.save(checkpoint_dict, out_checkpoint_path) + + +if __name__ == '__main__': + # reload_model() + change_iter() \ No newline at end of file diff --git a/AutoCoverTool/svc_inference/config.json b/AutoCoverTool/ref/so_vits_svc/config.json similarity index 100% copy from AutoCoverTool/svc_inference/config.json copy to AutoCoverTool/ref/so_vits_svc/config.json diff --git a/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py b/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py index 628cbdf..06a4676 100644 --- a/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py +++ b/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py @@ -1,342 +1,433 @@ import hashlib import json import logging import os import time from pathlib import Path import librosa import maad import numpy as np # import onnxruntime import parselmouth import soundfile import torch import torchaudio from hubert import hubert_model import utils from models import SynthesizerTrn import copy logging.getLogger('matplotlib').setLevel(logging.WARNING) from mel_processing import spectrogram_torch, spec_to_mel_torch def get_spec(audio): audio_norm = audio print(audio_norm) spec = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False) return spec def read_temp(file_name): if not os.path.exists(file_name): with open(file_name, "w") as f: f.write(json.dumps({"info": "temp_dict"})) return {} else: try: with open(file_name, "r") as f: data = f.read() data_dict = json.loads(data) if os.path.getsize(file_name) > 50 * 1024 * 1024: f_name = file_name.replace("\\", "/").split("/")[-1] print(f"clean {f_name}") for wav_hash in list(data_dict.keys()): if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600: del data_dict[wav_hash] except Exception as e: print(e) print(f"{file_name} error,auto rebuild file") data_dict = {"info": "temp_dict"} return data_dict def write_temp(file_name, data): with open(file_name, "w") as f: f.write(json.dumps(data)) def timeit(func): def run(*args, **kwargs): t = time.time() res = func(*args, **kwargs) print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t)) return res return run def format_wav(audio_path): if Path(audio_path).suffix == '.wav': return raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None) soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate) def get_end_file(dir_path, end): file_lists = [] for root, dirs, files in os.walk(dir_path): files = [f for f in files if f[0] != '.'] dirs[:] = [d for d in dirs if d[0] != '.'] for f_file in files: if f_file.endswith(end): file_lists.append(os.path.join(root, f_file).replace("\\", "/")) return file_lists def get_md5(content): return hashlib.new("md5", content).hexdigest() def resize2d_f0(x, target_len): source = np.array(x) source[source < 0.001] = np.nan target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)), source) res = np.nan_to_num(target) return res def get_f0(x, p_len, f0_up_key=0): time_step = 160 / 16000 * 1000 f0_min = 50 f0_max = 1100 f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0 = parselmouth.Sound(x, 16000).to_pitch_ac( time_step=time_step / 1000, 
voicing_threshold=0.6, pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] if len(f0) > p_len: f0 = f0[:p_len] pad_size = (p_len - len(f0) + 1) // 2 if (pad_size > 0 or p_len - len(f0) - pad_size > 0): f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant') f0 *= pow(2, f0_up_key / 12) f0_mel = 1127 * np.log(1 + f0 / 700) f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 f0_coarse = np.rint(f0_mel).astype(np.int) return f0_coarse, f0 def clean_pitch(input_pitch): num_nan = np.sum(input_pitch == 1) if num_nan / len(input_pitch) > 0.9: input_pitch[input_pitch != 1] = 1 return input_pitch def plt_pitch(input_pitch): input_pitch = input_pitch.astype(float) input_pitch[input_pitch == 1] = np.nan return input_pitch def f0_to_pitch(ff): f0_pitch = 69 + 12 * np.log2(ff / 440) return f0_pitch def fill_a_to_b(a, b): if len(a) < len(b): for _ in range(0, len(b) - len(a)): a.append(a[0]) def mkdir(paths: list): for path in paths: if not os.path.exists(path): os.mkdir(path) class Svc(object): def __init__(self, net_g_path, config_path, hubert_path="/data/prod/so_vits_models/models/hubert-soft-0d54a1f4.pt", onnx=False): self.onnx = onnx self.net_g_path = net_g_path self.hubert_path = hubert_path self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.net_g_ms = None self.hps_ms = utils.get_hparams_from_file(config_path) self.target_sample = self.hps_ms.data.sampling_rate self.hop_size = self.hps_ms.data.hop_length self.speakers = {} for spk, sid in self.hps_ms.spk.items(): self.speakers[sid] = spk self.spk2id = self.hps_ms.spk # 加载hubert self.hubert_soft = hubert_model.hubert_soft(hubert_path) if torch.cuda.is_available(): self.hubert_soft = self.hubert_soft.cuda() self.load_model() def load_model(self): # 获取模型配置 if self.onnx: raise NotImplementedError # self.net_g_ms = SynthesizerTrnForONNX( # 178, # self.hps_ms.data.filter_length // 2 + 1, # self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, # n_speakers=self.hps_ms.data.n_speakers, # **self.hps_ms.model) # _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None) else: self.net_g_ms = SynthesizerTrn( self.hps_ms.data.filter_length // 2 + 1, self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, - **self.hps_ms.model) + **self.hps_ms.model, no_flow=True, use_v3=True) _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None) if "half" in self.net_g_path and torch.cuda.is_available(): _ = self.net_g_ms.half().eval().to(self.dev) else: _ = self.net_g_ms.eval().to(self.dev) def get_units(self, source, sr): source = source.unsqueeze(0).to(self.dev) with torch.inference_mode(): start = time.time() units = self.hubert_soft.units(source) use_time = time.time() - start print("hubert use time:{}".format(use_time)) return units def get_unit_pitch(self, in_path, tran): source, sr = torchaudio.load(in_path) source_bak = copy.deepcopy(source) source = torchaudio.functional.resample(source, sr, 16000) if len(source.shape) == 2 and source.shape[1] >= 2: source = torch.mean(source, dim=0).unsqueeze(0) soft = self.get_units(source, sr).squeeze(0).cpu().numpy() f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran) return soft, f0, source_bak def infer(self, speaker_id, tran, raw_path, dev=False): if type(speaker_id) == str: speaker_id = self.spk2id[speaker_id] sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0) soft, pitch, source = 
self.get_unit_pitch(raw_path, tran) f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev) if "half" in self.net_g_path and torch.cuda.is_available(): stn_tst = torch.HalfTensor(soft) else: stn_tst = torch.FloatTensor(soft) # 提取幅度谱 # spec = get_spec(source).to(self.dev) with torch.no_grad(): x_tst = stn_tst.unsqueeze(0).to(self.dev) start = time.time() x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2) audio = self.net_g_ms.infer(x_tst, f0=f0, g=sid)[0, 0].data.float() # audio = self.net_g_ms.infer_v1(x_tst, spec[:, :, :f0.size(-1)], f0=f0, g=sid)[0, 0].data.float() use_time = time.time() - start print("vits use time:{}".format(use_time)) return audio, audio.shape[-1] +class SVCRealTimeByBuffer(object): + def __init__(self, net_g_path, config_path, hubert_path="/data/prod/so_vits_models/models/hubert-soft-0d54a1f4.pt"): + self.net_g_path = net_g_path + self.hubert_path = hubert_path + self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.net_g_ms = None + self.hps_ms = utils.get_hparams_from_file(config_path) + self.target_sample = self.hps_ms.data.sampling_rate + self.hop_size = self.hps_ms.data.hop_length + self.speakers = {} + for spk, sid in self.hps_ms.spk.items(): + self.speakers[sid] = spk + self.spk2id = self.hps_ms.spk + # 加载hubert + self.hubert_soft = hubert_model.hubert_soft(hubert_path) + if torch.cuda.is_available(): + self.hubert_soft = self.hubert_soft.cuda() + self.load_model() + + def load_model(self): + self.net_g_ms = SynthesizerTrn( + self.hps_ms.data.filter_length // 2 + 1, + self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, + **self.hps_ms.model, no_flow=True) + # _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None) + net_g = torch.load(self.net_g_path, map_location='cpu') + self.net_g_ms.load_state_dict(net_g) + if "half" in self.net_g_path and torch.cuda.is_available(): + _ = self.net_g_ms.half().eval().to(self.dev) + else: + _ = self.net_g_ms.eval().to(self.dev) + + def get_units(self, source, sr): + source = source.unsqueeze(0).to(self.dev) + print("source_shape===>", source.shape) + with torch.inference_mode(): + start = time.time() + units = self.hubert_soft.units(source) + use_time = time.time() - start + print("hubert use time:{}".format(use_time)) + return units + + def get_unit_pitch(self, source, sr, tran): + source = torchaudio.functional.resample(source, sr, 16000) + if len(source.shape) == 2 and source.shape[1] >= 2: + source = torch.mean(source, dim=0).unsqueeze(0) + soft = self.get_units(source, sr).squeeze(0).cpu().numpy() + f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran) + return soft, f0 + + def infer(self, speaker_id, tran, source, sr): + if type(speaker_id) == str: + speaker_id = self.spk2id[speaker_id] + sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0) + soft, pitch = self.get_unit_pitch(source, sr, tran) + f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev) + if "half" in self.net_g_path and torch.cuda.is_available(): + stn_tst = torch.HalfTensor(soft) + else: + stn_tst = torch.FloatTensor(soft) + + with torch.no_grad(): + x_tst = stn_tst.unsqueeze(0).to(self.dev) + start = time.time() + x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2) + audio = self.net_g_ms.infer(x_tst, f0=f0, g=sid)[0, 0].data.float() + use_time = time.time() - start + print("vits use time:{}".format(use_time)) + return audio, audio.shape[-1] + + def process(self, vocal_path, dst_path, tran=0): + source, sr = 
librosa.load(vocal_path, sr=32000, mono=True) + # 按照每秒一次进行处理 + out_audio = [] + source = torch.tensor(source).to(self.dev) + hop_len = 3840 * 4 # 120ms + length = 640 * 1000 + for i in range(0, len(source), length - hop_len): + cur_hop_len = hop_len + input_data = source[i:i + length].unsqueeze(0) + audio, _ = self.infer(0, tran, input_data, sr) + if len(audio) < hop_len: + break + if len(out_audio) > 0: + # 本次开头和前面的末尾做fade + for j in range(hop_len): + out_audio[i+j] = out_audio[i+j] * (1-(j / hop_len)) + audio[j] * (j / hop_len) + else: + cur_hop_len = 0 + out_audio.extend(audio[cur_hop_len:]) + soundfile.write(dst_path, out_audio, sr, format="wav") # class SvcONNXInferModel(object): # def __init__(self, hubert_onnx, vits_onnx, config_path): # self.config_path = config_path # self.vits_onnx = vits_onnx # self.hubert_onnx = hubert_onnx # self.hubert_onnx_session = onnxruntime.InferenceSession(hubert_onnx, providers=['CUDAExecutionProvider', ]) # self.inspect_onnx(self.hubert_onnx_session) # self.vits_onnx_session = onnxruntime.InferenceSession(vits_onnx, providers=['CUDAExecutionProvider', ]) # self.inspect_onnx(self.vits_onnx_session) # self.hps_ms = utils.get_hparams_from_file(self.config_path) # self.target_sample = self.hps_ms.data.sampling_rate # self.feature_input = FeatureInput(self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length) # # @staticmethod # def inspect_onnx(session): # for i in session.get_inputs(): # print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type)) # for i in session.get_outputs(): # print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type)) # # def infer(self, speaker_id, tran, raw_path): # sid = np.array([int(speaker_id)], dtype=np.int64) # soft, pitch = self.get_unit_pitch(raw_path, tran) # pitch = np.expand_dims(pitch, axis=0).astype(np.int64) # stn_tst = soft # x_tst = np.expand_dims(stn_tst, axis=0) # x_tst_lengths = np.array([stn_tst.shape[0]], dtype=np.int64) # # 使用ONNX Runtime进行推理 # start = time.time() # audio = self.vits_onnx_session.run(output_names=["audio"], # input_feed={ # "hidden_unit": x_tst, # "lengths": x_tst_lengths, # "pitch": pitch, # "sid": sid, # })[0][0, 0] # use_time = time.time() - start # print("vits_onnx_session.run time:{}".format(use_time)) # audio = torch.from_numpy(audio) # return audio, audio.shape[-1] # # def get_units(self, source, sr): # source = torchaudio.functional.resample(source, sr, 16000) # if len(source.shape) == 2 and source.shape[1] >= 2: # source = torch.mean(source, dim=0).unsqueeze(0) # source = source.unsqueeze(0) # # 使用ONNX Runtime进行推理 # start = time.time() # units = self.hubert_onnx_session.run(output_names=["embed"], # input_feed={"source": source.numpy()})[0] # use_time = time.time() - start # print("hubert_onnx_session.run time:{}".format(use_time)) # return units # # def transcribe(self, source, sr, length, transform): # feature_pit = self.feature_input.compute_f0(source, sr) # feature_pit = feature_pit * 2 ** (transform / 12) # feature_pit = resize2d_f0(feature_pit, length) # coarse_pit = self.feature_input.coarse_f0(feature_pit) # return coarse_pit # # def get_unit_pitch(self, in_path, tran): # source, sr = torchaudio.load(in_path) # soft = self.get_units(source, sr).squeeze(0) # input_pitch = self.transcribe(source.numpy()[0], sr, soft.shape[0], tran) # return soft, input_pitch class RealTimeVC: def __init__(self): self.last_chunk = None self.last_o = None self.chunk_len = 16000 # 区块长度 self.pre_len = 3840 # 交叉淡化长度,640的倍数 """输入输出都是1维numpy 音频波形数组""" def process(self, svc_model, 
speaker_id, f_pitch_change, input_wav_path): audio, sr = torchaudio.load(input_wav_path) audio = audio.cpu().numpy()[0] temp_wav = io.BytesIO() if self.last_chunk is None: input_wav_path.seek(0) audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path) audio = audio.cpu().numpy() self.last_chunk = audio[-self.pre_len:] self.last_o = audio return audio[-self.chunk_len:] else: audio = np.concatenate([self.last_chunk, audio]) soundfile.write(temp_wav, audio, sr, format="wav") temp_wav.seek(0) audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav) audio = audio.cpu().numpy() ret = maad.util.crossfade(self.last_o, audio, self.pre_len) self.last_chunk = audio[-self.pre_len:] self.last_o = audio return ret[self.chunk_len:2 * self.chunk_len] diff --git a/AutoCoverTool/ref/so_vits_svc/models.py b/AutoCoverTool/ref/so_vits_svc/models.py index 477f395..3e9eba9 100644 --- a/AutoCoverTool/ref/so_vits_svc/models.py +++ b/AutoCoverTool/ref/so_vits_svc/models.py @@ -1,412 +1,439 @@ import copy import math import torch from torch import nn from torch.nn import functional as F import attentions import commons import modules from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from commons import init_weights, get_padding from vdecoder.hifigan.models import Generator from utils import f0_to_coarse class ResidualCouplingBlock(nn.Module): def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0): super().__init__() self.channels = channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size self.dilation_rate = dilation_rate self.n_layers = n_layers self.n_flows = n_flows self.gin_channels = gin_channels self.flows = nn.ModuleList() for i in range(n_flows): self.flows.append( modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) self.flows.append(modules.Flip()) def forward(self, x, x_mask, g=None, reverse=False): if not reverse: for flow in self.flows: x, _ = flow(x, x_mask, g=g, reverse=reverse) else: for flow in reversed(self.flows): x = flow(x, x_mask, g=g, reverse=reverse) return x class Encoder(nn.Module): def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size self.dilation_rate = dilation_rate self.n_layers = n_layers self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, x, x_lengths, g=None): # print(x.shape,x_lengths.shape) # commons.sequence_mask 对于batch层级有价值,x_lengths是每个batch中每一个元素的帧数 # 比如输入([3,5,2], 5)那么得到 3 * 5的True/False矩阵,其中第一层矩阵为3个true,2个false,第二层全true,第三层前两个true,其余false # 作用一个批次中允许不同长度的数据一起训练,此时较短的乘以false,剔除影响 x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask m, logs = torch.split(stats, self.out_channels, dim=1) z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask return z, m, logs, x_mask class TextEncoder(nn.Module): def __init__(self, in_channels, out_channels, hidden_channels, 
kernel_size, dilation_rate, n_layers, gin_channels=0, filter_channels=None, n_heads=None, p_dropout=None): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size self.dilation_rate = dilation_rate self.n_layers = n_layers self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) self.f0_emb = nn.Embedding(256, hidden_channels) self.enc_ = attentions.Encoder( hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) def forward(self, x, x_lengths, f0=None): # x->(b,256,frame_num), x_lengths -> (b) # commons.sequence_mask 对于batch层级有价值,x_lengths是每个batch中每一个元素的帧数 # 比如输入([3,5,2], 5)那么得到 3 * 5的True/False矩阵,其中第一层矩阵为3个true,2个false,第二层全true,第三层前两个true,其余false # 作用一个批次中允许不同长度的数据一起训练,此时较短的乘以false,剔除影响 x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = x + self.f0_emb(f0).transpose(1, 2) x = self.enc_(x * x_mask, x_mask) stats = self.proj(x) * x_mask # m是VAE过程中得到的mu,而log对应的是log(sigma) m, logs = torch.split(stats, self.out_channels, dim=1) # z是随机采样过程 z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask return z, m, logs, x_mask class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() self.period = period self.use_spectral_norm = use_spectral_norm norm_f = weight_norm if use_spectral_norm == False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), ]) self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) def forward(self, x): fmap = [] # 1d to 2d b, c, t = x.shape if t % self.period != 0: # pad first n_pad = self.period - (t % self.period) x = F.pad(x, (0, n_pad), "reflect") t = t + n_pad x = x.view(b, c, t // self.period, self.period) for l in self.convs: x = l(x) x = F.leaky_relu(x, modules.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) x = torch.flatten(x, 1, -1) return x, fmap class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() norm_f = weight_norm if use_spectral_norm == False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv1d(1, 16, 15, 1, padding=7)), norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), ]) self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) def forward(self, x): fmap = [] for l in self.convs: x = l(x) x = F.leaky_relu(x, modules.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) x = torch.flatten(x, 1, -1) return x, fmap class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(MultiPeriodDiscriminator, self).__init__() periods = [2, 3, 5, 7, 11] discs = 
[DiscriminatorS(use_spectral_norm=use_spectral_norm)] discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] self.discriminators = nn.ModuleList(discs) def forward(self, y, y_hat): y_d_rs = [] y_d_gs = [] fmap_rs = [] fmap_gs = [] for i, d in enumerate(self.discriminators): y_d_r, fmap_r = d(y) y_d_g, fmap_g = d(y_hat) y_d_rs.append(y_d_r) y_d_gs.append(y_d_g) fmap_rs.append(fmap_r) fmap_gs.append(fmap_g) return y_d_rs, y_d_gs, fmap_rs, fmap_gs class SpeakerEncoder(torch.nn.Module): def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): super(SpeakerEncoder, self).__init__() self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() def forward(self, mels): self.lstm.flatten_parameters() _, (hidden, _) = self.lstm(mels) embeds_raw = self.relu(self.linear(hidden[-1])) return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) def compute_partial_slices(self, total_frames, partial_frames, partial_hop): mel_slices = [] for i in range(0, total_frames - partial_frames, partial_hop): mel_range = torch.arange(i, i + partial_frames) mel_slices.append(mel_range) return mel_slices def embed_utterance(self, mel, partial_frames=128, partial_hop=64): mel_len = mel.size(1) last_mel = mel[:, -partial_frames:] if mel_len > partial_frames: mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) mels = list(mel[:, s] for s in mel_slices) mels.append(last_mel) mels = torch.stack(tuple(mels), 0).squeeze(1) with torch.no_grad(): partial_embeds = self(mels) embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) # embed = embed / torch.linalg.norm(embed, 2) else: with torch.no_grad(): embed = self(last_mel) return embed class SynthesizerTrn(nn.Module): """ Synthesizer for Training """ def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, ssl_dim, n_speakers, + no_flow=False, + use_v3=False, **kwargs): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels self.filter_channels = filter_channels self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size self.p_dropout = p_dropout self.resblock = resblock self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_dilation_sizes = resblock_dilation_sizes self.upsample_rates = upsample_rates self.upsample_initial_channel = upsample_initial_channel self.upsample_kernel_sizes = upsample_kernel_sizes self.segment_size = segment_size self.gin_channels = gin_channels self.ssl_dim = ssl_dim self.emb_g = nn.Embedding(n_speakers, gin_channels) self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16, 0, filter_channels, n_heads, p_dropout) + self.no_flow = no_flow + self.use_v3 = use_v3 hps = { "sampling_rate": 32000, "inter_channels": 192, "resblock": "1", "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], "upsample_rates": [10, 8, 2, 2], "upsample_initial_channel": 512, "upsample_kernel_sizes": [16, 16, 4, 4], "gin_channels": 256, } + if self.use_v3: + # v3的结构 + hps = { + "sampling_rate": 32000, + "inter_channels": 192, + "resblock": "1", + 
"resblock_kernel_sizes": [3, 5, 7], + "resblock_dilation_sizes": [[1, 2], [2, 6], [3, 12]], + "upsample_rates": [10, 8, 4], + "upsample_initial_channel": 256, + "upsample_kernel_sizes": [16, 16, 8], + "gin_channels": 256, + } self.dec = Generator(h=hps) self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + print("no_flow={}, use_v3={}".format(self.no_flow, self.use_v3)) + + if not self.no_flow: + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) def forward(self, c, f0, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): # hubert特征(b,256,frame_num), f0 (frame_num), 幅度谱特征, 说话人id,mel谱特征 if c_lengths == None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) # [frame_num,....] if spec_lengths == None: spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) # 说话人信息embding g = self.emb_g(g).transpose(1, 2) # 采样得到的z,vae需要的均值,logs_p是vae需要的log(sigma) # 输入hubert特征(b,256,frame_num), f0 (frame_num),对应的是文本出隐变量的那段模型 z_ptemp, m_p, logs_p, _ = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0)) # 输入幅度谱和说话人信息 # 输出采样得到的z,m_q是均值,logs_q是log(sigma) z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) # 标准化流,增加分布复杂程度 - z_p = self.flow(z, spec_mask, g=g) + if not self.no_flow: + z_p = self.flow(z, spec_mask, g=g) # 由于整个batch中含有的音频帧数不一致,要求每一个元素都随机裁剪出segment_size长度的特征 # 返回z的batch列表,pitch_slice列表和ids_slice的列表 z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size) # o = self.dec(z_slice, g=g) # 解码部分,输入未经过标准化的z,以及说话人信息和pitch,得到wav波形 o = self.dec(z_slice, g=g, f0=pitch_slice) + if self.no_flow: + z_p = z # 原始波形,批次中每个采样到的帧的位置,批次中幅度谱的有效帧位置, # 幅度谱编码得到正态分布后随机采样得到的z, z经过标准化流之后得到z_p, hubert特征层得到的正态分布的均值, # hubert特征层得到的正态分布的标准差(logs_p),幅度谱和人声信息得到的均值(m_q),幅度谱和人声信息得到的标准差(logs_q) return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) def infer(self, c, f0, g=None, mel=None, c_lengths=None): + print(c.shape, f0.shape, g.shape) # hubert特征(b,256,frame_num), f0 (frame_num), 说话人id, mel谱特征 if c_lengths == None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) # [frame_num,....] 
# 说话人信息经过embdding g = self.emb_g(g).transpose(1, 2) # 采样得到的z,vae需要的均值,logs_p是vae需要的log(sigma) # 输入hubert特征(b,256,frame_num), f0 (frame_num) # 其中c_mask的内容是由于每个batch中的元素的frame_num有可能不一致长,所以使用c_mask保证每个元素都能训练到自己的所有信息 z_p, m_p, logs_p, c_mask = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0)) # 将说话人和采样到的z信息塞入到标准化流中 - z = self.flow(z_p, c_mask, g=g, reverse=True) + if not self.no_flow: + z = self.flow(z_p, c_mask, g=g, reverse=True) + z_p = z # 解码得到波形信息 - o = self.dec(z * c_mask, g=g, f0=f0) + o = self.dec(z_p * c_mask, g=g, f0=f0) return o def infer_v1(self, c, spec, f0, g): print(c.shape, spec.shape, f0.shape, g.shape) c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) # (b, frame_num) spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) g = self.emb_g(g).transpose(1, 2) # z_p, m_p, logs_p, c_mask = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0)) # z = self.flow(z_p, c_mask, g=g, reverse=True) # o = self.dec(z * c_mask, g=g, f0=f0) # print(c_mask.shape, c_mask) z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) o = self.dec(z, g=g, f0=f0) return o if __name__ == '__main__': # m = MultiPeriodDiscriminator() # y = torch.rand((1, 1, 96000)) # y_hat = torch.rand((1, 1, 96000)) # a, b, c, d = m(y, y_hat) t1 = DiscriminatorS() y = torch.rand(1, 1, 512) a, b = t1(y) print(a.shape) diff --git a/AutoCoverTool/ref/so_vits_svc/preprocess_flist_config.py b/AutoCoverTool/ref/so_vits_svc/preprocess_flist_config.py index 5b7e80a..59f7bf8 100644 --- a/AutoCoverTool/ref/so_vits_svc/preprocess_flist_config.py +++ b/AutoCoverTool/ref/so_vits_svc/preprocess_flist_config.py @@ -1,132 +1,137 @@ import os import argparse import re from tqdm import tqdm from random import shuffle import json config_template = { "train": { "log_interval": 200, "eval_interval": 1000, "seed": 1234, - "epochs": 400, # 由10000->400 + "epochs": 10000, # 由10000->400 "learning_rate": 1e-4, "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 12, "fp16_run": False, "lr_decay": 0.999875, "segment_size": 17920, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0, "use_sr": True, "max_speclen": 384, "port": "8001" }, "data": { "training_files": "filelists/train.txt", "validation_files": "filelists/val.txt", "max_wav_value": 32768.0, "sampling_rate": 32000, "filter_length": 1280, "hop_length": 320, "win_length": 1280, "n_mel_channels": 80, "mel_fmin": 0.0, "mel_fmax": None }, "model": { "inter_channels": 192, "hidden_channels": 192, "filter_channels": 768, "n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0.1, "resblock": "1", "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], "upsample_rates": [10, 8, 2, 2], "upsample_initial_channel": 512, "upsample_kernel_sizes": [16, 16, 4, 4], "n_layers_q": 3, "use_spectral_norm": False, "gin_channels": 256, "ssl_dim": 256, "n_speakers": 0, }, "spk": { "nen": 0, "paimon": 1, "yunhao": 2 } } pattern = re.compile(r'^[\.a-zA-Z0-9_\/]+$') if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list") parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list") parser.add_argument("--test_list", type=str, default="./filelists/test.txt", help="path to test list") parser.add_argument("--source_dir", type=str, default="./dataset/32k", help="path to source dir") parser.add_argument("--config_path", type=str, default="./config/config.json", help="path to source dir") 
args = parser.parse_args() train = [] val = [] test = [] idx = 0 spk_dict = {} spk_id = 0 for speaker in tqdm(os.listdir(args.source_dir)): spk_dict[speaker] = spk_id spk_id += 1 wavs = ["/".join([args.source_dir, speaker, i]) for i in os.listdir(os.path.join(args.source_dir, speaker))] for wavpath in wavs: if not pattern.match(wavpath): print(f"warning:文件名{wavpath}中包含非字母数字下划线,可能会导致错误。(也可能不会)") if len(wavs) < 10: print(f"warning:{speaker}数据集数量小于10条,请补充数据") wavs = [i for i in wavs if i.endswith("wav")] shuffle(wavs) - train += wavs[2:-2] - val += wavs[:2] - test += wavs[-2:] + if len(wavs) < 10: + train += wavs + val += wavs + test += wavs + else: + train += wavs[2:-2] + val += wavs[:2] + test += wavs[-2:] n_speakers = len(spk_dict.keys()) * 2 shuffle(train) shuffle(val) shuffle(test) print("Writing", args.train_list) with open(args.train_list, "w") as f: for fname in tqdm(train): wavpath = fname f.write(wavpath + "\n") print("Writing", args.val_list) with open(args.val_list, "w") as f: for fname in tqdm(val): wavpath = fname f.write(wavpath + "\n") print("Writing", args.test_list) with open(args.test_list, "w") as f: for fname in tqdm(test): wavpath = fname f.write(wavpath + "\n") config_template["model"]["n_speakers"] = n_speakers config_template["spk"] = spk_dict print("Writing configs/config.json") # 修改配置文件 config_template["data"]["training_files"] = args.train_list config_template["data"]["validation_files"] = args.val_list with open(args.config_path, "w") as f: json.dump(config_template, f, indent=2) diff --git a/AutoCoverTool/ref/so_vits_svc/real_time_inference.py b/AutoCoverTool/ref/so_vits_svc/real_time_inference.py new file mode 100644 index 0000000..e0f5af0 --- /dev/null +++ b/AutoCoverTool/ref/so_vits_svc/real_time_inference.py @@ -0,0 +1,23 @@ +""" +实时推理 +""" +import os +import time +from ref.so_vits_svc.inference.infer_tool import SVCRealTimeByBuffer, Svc + + +def test(in_path, out_path): + config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json") + print(config_path) + net_g_path = "/Users/yangjianli/starmaker/av_tools/resource_av_svc/models/G_inference_1000.pth" + hubert_net_path = "/Users/yangjianli/starmaker/av_tools/resource_av_svc/models/hubert-soft-0d54a1f4.pt" + stbb = SVCRealTimeByBuffer(net_g_path, config_path, hubert_net_path) + stbb.process(in_path, out_path) + + +if __name__ == '__main__': + in_path = "/Users/yangjianli/starmaker/av_tools/resource_av_svc/clean_jianli/vocal_32.wav" + out_path = "/Users/yangjianli/starmaker/av_tools/resource_av_svc/clean_jianli/vocal_out.wav" + st = time.time() + test(in_path, out_path) + print("sp={}".format(time.time() - st)) diff --git a/AutoCoverTool/ref/so_vits_svc/train.py b/AutoCoverTool/ref/so_vits_svc/train.py index 69f56ac..cb656f3 100644 --- a/AutoCoverTool/ref/so_vits_svc/train.py +++ b/AutoCoverTool/ref/so_vits_svc/train.py @@ -1,312 +1,334 @@ import logging logging.getLogger('matplotlib').setLevel(logging.WARNING) logging.getLogger('numba').setLevel(logging.WARNING) import os import json import argparse import itertools import math import torch from torch import nn, optim from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter import torch.multiprocessing as mp import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler import commons import utils from data_utils import TextAudioSpeakerLoader, EvalDataLoader from models import ( 
SynthesizerTrn, MultiPeriodDiscriminator, ) from losses import ( kl_loss, generator_loss, discriminator_loss, feature_loss ) from mel_processing import mel_spectrogram_torch, spec_to_mel_torch torch.backends.cudnn.benchmark = True global_step = 0 # os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO' def main(): """Assume Single Node Multi GPUs Training Only""" assert torch.cuda.is_available(), "CPU training is not allowed." hps = utils.get_hparams() n_gpus = torch.cuda.device_count() os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = hps.train.port mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) def run(rank, n_gpus, hps): print("CurRank:===>", rank) global global_step if rank == 0: logger = utils.get_logger(hps.model_dir) logger.info(hps) utils.check_git_hash(hps.model_dir) writer = SummaryWriter(log_dir=hps.model_dir) writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) torch.manual_seed(hps.train.seed) torch.cuda.set_device(rank) # 从每段音频文件中获取特征 # hubert特征,f0,幅度谱特征,对应音频段波形(384 * hop_length),人声编码[0],每一次获取3840ms长度的特征 - train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps) - train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, + # train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps) + from script.train_user_by_one_media import TextAudioSpeakerLoader + train_dataset = TextAudioSpeakerLoader('data/train_users/qiankun_v1/vocals/speaker0/qiankun.wav') + train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, pin_memory=True, batch_size=hps.train.batch_size) if rank == 0: eval_dataset = EvalDataLoader(hps.data.validation_files, hps) - eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False, + eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False, batch_size=1, pin_memory=False, drop_last=False) net_g = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, - **hps.model).cuda(rank) + **hps.model, no_flow=False, use_v3=False).cuda(rank) net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) optim_g = torch.optim.AdamW( net_g.parameters(), hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) optim_d = torch.optim.AdamW( net_d.parameters(), hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) - net_g = DDP(net_g, device_ids=[rank]) # , find_unused_parameters=True) - net_d = DDP(net_d, device_ids=[rank]) + # net_g = DDP(net_g, device_ids=[rank]) # , find_unused_parameters=True) + # net_d = DDP(net_d, device_ids=[rank]) + print("{}".format(hps.model_dir)) try: _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) global_step = (epoch_str - 1) * len(train_loader) - print("load checkpoint ok !") - except: + print("load checkpoint ok ! {}".format(epoch_str)) + except Exception as ex: + global_step = 1 epoch_str = 1 - global_step = 0 + print("EXXX ! 
{}".format(ex)) scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scaler = GradScaler(enabled=hps.train.fp16_run) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval]) else: train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None) scheduler_g.step() scheduler_d.step() def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): net_g, net_d = nets optim_g, optim_d = optims scheduler_g, scheduler_d = schedulers train_loader, eval_loader = loaders if writers is not None: writer, writer_eval = writers # train_loader.batch_sampler.set_epoch(epoch) global global_step net_g.train() net_d.train() for batch_idx, items in enumerate(train_loader): # hubert特征,f0,幅度谱特征,对应音频段波形(384 * hop_length),人声编码[0] c, f0, spec, y, spk = items g = spk.cuda(rank, non_blocking=True) spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True) c = c.cuda(rank, non_blocking=True) f0 = f0.cuda(rank, non_blocking=True) """ "sampling_rate": 32000, "filter_length": 1280, "hop_length": 320, "win_length": 1280, "n_mel_channels": 80, "mel_fmin": 0.0, "mel_fmax": null """ mel = spec_to_mel_torch( spec, hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax) with autocast(enabled=hps.train.fp16_run): # net_g的输入: hubert特征,f0,幅度谱特征,说话人id,mel谱特征 # net_g的输出: # 原始波形,批次中每个采样到的帧的位置,批次中幅度谱的有效帧位置, # 幅度谱编码得到正态分布后随机采样得到的z, z经过标准化流之后得到z_p, hubert特征层得到的正态分布的均值, # hubert特征层得到的正态分布的标准差(logs_p),幅度谱和人声信息得到的均值(m_q),幅度谱和人声信息得到的标准差(logs_q) y_hat, ids_slice, z_mask, \ (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel) y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) y_hat_mel = mel_spectrogram_torch( y_hat.squeeze(1), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, hps.data.mel_fmax ) y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) with autocast(enabled=False): loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) loss_disc_all = loss_disc optim_d.zero_grad() scaler.scale(loss_disc_all).backward() scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + # grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) scaler.step(optim_d) with autocast(enabled=hps.train.fp16_run): # Generator y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) with autocast(enabled=False): # mel谱之间的损失函数,后面是系数,误差越小越好 loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel # KL散度,z_p: 幅度谱侧得到的采样值经过标准化流之后的结果,logs_q: 幅度谱侧得到的标准差,m_p:hubert侧得到的均值 # logs_p: hubert侧得到的标准差,z_mask: 批次中幅度谱的有效帧位置, loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl # 在d模型中将y和y_hat的每一层特征结果都拿出来,做l1距离 loss_fm = feature_loss(fmap_r, fmap_g) loss_gen, losses_gen = generator_loss(y_d_hat_g) loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl optim_g.zero_grad() scaler.scale(loss_gen_all).backward() 
scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + # grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) scaler.step(optim_g) scaler.update() if rank == 0: if global_step % hps.train.log_interval == 0: lr = optim_g.param_groups[0]['lr'] losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] logger.info('Train Epoch: {} [{:.0f}%]'.format( epoch, 100. * batch_idx / len(train_loader))) logger.info([x.item() for x in losses] + [global_step, lr]) - scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, - "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} - scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}) - - scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) - scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) - scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) - image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), - "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), - "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), - } - - utils.summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict - ) - - if global_step % hps.train.eval_interval == 0: + losses_numpy = [round(loss_disc.item(), 3), round(loss_gen.item(), 3), + round(loss_fm.item(), 3), round(loss_mel.item(), 3), round(loss_kl.item(), 3)] + print("gstep={},lr={},disc={},gen={},fm={},mel={},kl={},tot={}".format(global_step, lr, + losses_numpy[0], + losses_numpy[1], + losses_numpy[2], + losses_numpy[3], + losses_numpy[4], + sum(losses_numpy))) + + # scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, + # "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} + # scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}) + # + # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) + # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) + # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) + # image_dict = { + # "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), + # "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), + # "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), + # } + + # utils.summarize( + # writer=writer, + # global_step=global_step, + # images=image_dict, + # scalars=scalar_dict + # ) + + if global_step % hps.train.eval_interval == 0 and global_step != 0: evaluate(hps, net_g, eval_loader, writer_eval) utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) # 达到2000个step则停止 - if global_step == 2000: - logger.info('====> 2000 ==> Epoch: {},{}'.format(epoch, global_step)) + if global_step == 1000: + logger.info('====> 1000 ==> Epoch: {},{}'.format(epoch, global_step)) exit(0) global_step += 1 if rank == 0: logger.info('====> Epoch: {},{}'.format(epoch, global_step)) def evaluate(hps, generator, eval_loader, writer_eval): generator.eval() image_dict = {} audio_dict = {} with torch.no_grad(): for batch_idx, 
items in enumerate(eval_loader): c, f0, spec, y, spk = items g = spk[:1].cuda(0) spec, y = spec[:1].cuda(0), y[:1].cuda(0) c = c[:1].cuda(0) f0 = f0[:1].cuda(0) mel = spec_to_mel_torch( spec, hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax) - y_hat = generator.module.infer(c, f0, g=g, mel=mel) + + if hasattr(generator, 'module'): + y_hat = generator.module.infer(c, f0, g=g, mel=mel) + else: + y_hat = generator.infer(c, f0, g=g, mel=mel) y_hat_mel = mel_spectrogram_torch( y_hat.squeeze(1).float(), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, hps.data.mel_fmax ) audio_dict.update({ f"gen/audio_{batch_idx}": y_hat[0], f"gt/audio_{batch_idx}": y[0] }) image_dict.update({ f"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()), "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy()) }) utils.summarize( writer=writer_eval, global_step=global_step, images=image_dict, audios=audio_dict, audio_sampling_rate=hps.data.sampling_rate ) generator.train() if __name__ == "__main__": + import time + + st = time.time() main() + print("sp={}".format(time.time() - st)) diff --git a/AutoCoverTool/ref/so_vits_svc/utils.py b/AutoCoverTool/ref/so_vits_svc/utils.py index 2dadf1a..6bba348 100644 --- a/AutoCoverTool/ref/so_vits_svc/utils.py +++ b/AutoCoverTool/ref/so_vits_svc/utils.py @@ -1,366 +1,366 @@ import os import glob import re import sys import argparse import logging import json import subprocess import librosa import numpy as np import torchaudio from scipy.io.wavfile import read import torch import torchvision from torch.nn import functional as F from commons import sequence_mask from hubert import hubert_model MATPLOTLIB_FLAG = False logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) logger = logging f0_bin = 256 f0_max = 1100.0 f0_min = 50.0 f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700) def f0_to_coarse(f0): """ 将f0按照Log10的级别进行区分,最后归一化到[1-255] 之间 :param f0: :return: """ is_torch = isinstance(f0, torch.Tensor) f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 # np.rint() 四舍五入取整 f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) return f0_coarse def get_hubert_model(rank=None): hubert_soft = hubert_model.hubert_soft("/data/prod/so_vits_models/models/hubert-soft-0d54a1f4.pt") if rank is not None: hubert_soft = hubert_soft.cuda(rank) return hubert_soft def get_hubert_content(hmodel, y=None, path=None): if path is not None: source, sr = torchaudio.load(path) source = torchaudio.functional.resample(source, sr, 16000) if len(source.shape) == 2 and source.shape[1] >= 2: source = torch.mean(source, dim=0).unsqueeze(0) else: source = y source = source.unsqueeze(0) with torch.inference_mode(): units = hmodel.units(source) return units.transpose(1, 2) def get_content(cmodel, y): with torch.no_grad(): c = cmodel.extract_features(y.squeeze(1))[0] c = c.transpose(1, 2) return c def transform(mel, height): # 68-92 # r = np.random.random() # rate = r * 0.3 + 0.85 # 0.85-1.15 # height = int(mel.size(-2) * rate) tgt = torchvision.transforms.functional.resize(mel, (height, 
mel.size(-1))) if height >= mel.size(-2): return tgt[:, :mel.size(-2), :] else: silence = tgt[:, -1:, :].repeat(1, mel.size(-2) - height, 1) silence += torch.randn_like(silence) / 10 return torch.cat((tgt, silence), 1) def stretch(mel, width): # 0.5-2 return torchvision.transforms.functional.resize(mel, (mel.size(-2), width)) def load_checkpoint(checkpoint_path, model, optimizer=None): assert os.path.isfile(checkpoint_path) checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - iteration = checkpoint_dict['iteration'] - learning_rate = checkpoint_dict['learning_rate'] + iteration = checkpoint_dict.get('iteration', None) + learning_rate = checkpoint_dict.get('learning_rate', None) if iteration is None: iteration = 1 if learning_rate is None: learning_rate = 0.0002 - if optimizer is not None and checkpoint_dict['optimizer'] is not None: + if optimizer is not None and checkpoint_dict.get('optimizer', None) is not None: optimizer.load_state_dict(checkpoint_dict['optimizer']) saved_state_dict = checkpoint_dict['model'] if hasattr(model, 'module'): state_dict = model.module.state_dict() else: state_dict = model.state_dict() new_state_dict = {} for k, v in state_dict.items(): try: new_state_dict[k] = saved_state_dict[k] except: logger.info("%s is not in the checkpoint" % k) new_state_dict[k] = v if hasattr(model, 'module'): model.module.load_state_dict(new_state_dict) else: model.load_state_dict(new_state_dict) logger.info("Loaded checkpoint '{}' (iteration {})".format( checkpoint_path, iteration)) return model, optimizer, learning_rate, iteration def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): logger.info("Saving model and optimizer state at iteration {} to {}".format( iteration, checkpoint_path)) if hasattr(model, 'module'): state_dict = model.module.state_dict() else: state_dict = model.state_dict() torch.save({'model': state_dict, 'iteration': iteration, 'optimizer': optimizer.state_dict(), 'learning_rate': learning_rate}, checkpoint_path) clean_ckpt = False if clean_ckpt: clean_checkpoints(path_to_models='logs/32k/', n_ckpts_to_keep=3, sort_by_time=True) def clean_checkpoints(path_to_models='logs/48k/', n_ckpts_to_keep=2, sort_by_time=True): """Freeing up space by deleting saved ckpts Arguments: path_to_models -- Path to the model directory n_ckpts_to_keep -- Number of ckpts to keep, excluding G_0.pth and D_0.pth sort_by_time -- True -> chronologically delete ckpts False -> lexicographically delete ckpts """ ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))] name_key = (lambda _f: int(re.compile('._(\d+)\.pth').match(_f).group(1))) time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f))) sort_key = time_key if sort_by_time else name_key x_sorted = lambda _x: sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith('_0.pth')], key=sort_key) to_del = [os.path.join(path_to_models, fn) for fn in (x_sorted('G')[:-n_ckpts_to_keep] + x_sorted('D')[:-n_ckpts_to_keep])] del_info = lambda fn: logger.info(f".. 
Free up space by deleting ckpt {fn}") del_routine = lambda x: [os.remove(x), del_info(x)] rs = [del_routine(fn) for fn in to_del] def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): for k, v in scalars.items(): writer.add_scalar(k, v, global_step) for k, v in histograms.items(): writer.add_histogram(k, v, global_step) for k, v in images.items(): writer.add_image(k, v, global_step, dataformats='HWC') for k, v in audios.items(): writer.add_audio(k, v, global_step, audio_sampling_rate) def latest_checkpoint_path(dir_path, regex="G_*.pth"): f_list = glob.glob(os.path.join(dir_path, regex)) f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) x = f_list[-1] print(x) return x def plot_spectrogram_to_numpy(spectrogram): global MATPLOTLIB_FLAG if not MATPLOTLIB_FLAG: import matplotlib matplotlib.use("Agg") MATPLOTLIB_FLAG = True mpl_logger = logging.getLogger('matplotlib') mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt import numpy as np fig, ax = plt.subplots(figsize=(10, 2)) im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation='none') plt.colorbar(im, ax=ax) plt.xlabel("Frames") plt.ylabel("Channels") plt.tight_layout() fig.canvas.draw() data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) plt.close() return data def plot_alignment_to_numpy(alignment, info=None): global MATPLOTLIB_FLAG if not MATPLOTLIB_FLAG: import matplotlib matplotlib.use("Agg") MATPLOTLIB_FLAG = True mpl_logger = logging.getLogger('matplotlib') mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt import numpy as np fig, ax = plt.subplots(figsize=(6, 4)) im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', interpolation='none') fig.colorbar(im, ax=ax) xlabel = 'Decoder timestep' if info is not None: xlabel += '\n\n' + info plt.xlabel(xlabel) plt.ylabel('Encoder timestep') plt.tight_layout() fig.canvas.draw() data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) plt.close() return data def load_wav_to_torch(full_path): sampling_rate, data = read(full_path) return torch.FloatTensor(data.astype(np.float32)), sampling_rate def load_filepaths_and_text(filename, split="|"): with open(filename, encoding='utf-8') as f: filepaths_and_text = [line.strip().split(split) for line in f] return filepaths_and_text def get_hparams(init=True): parser = argparse.ArgumentParser() parser.add_argument('-c', '--config', type=str, default="./configs/base.json", help='JSON file for configuration') parser.add_argument('-m', '--model', type=str, required=True, help='Model name') parser.add_argument('-l', '--logs', type=str, required=True, help='log Name') args = parser.parse_args() model_dir = os.path.join(args.logs, args.model) if not os.path.exists(model_dir): os.makedirs(model_dir) config_path = args.config config_save_path = os.path.join(model_dir, "config.json") if init: with open(config_path, "r") as f: data = f.read() with open(config_save_path, "w") as f: f.write(data) else: with open(config_save_path, "r") as f: data = f.read() config = json.loads(data) hparams = HParams(**config) hparams.model_dir = model_dir return hparams def get_hparams_from_dir(model_dir): config_save_path = os.path.join(model_dir, "config.json") with open(config_save_path, "r") as f: data = f.read() config = json.loads(data) hparams = HParams(**config) 
hparams.model_dir = model_dir return hparams def get_hparams_from_file(config_path): with open(config_path, "r") as f: data = f.read() config = json.loads(data) hparams = HParams(**config) return hparams def check_git_hash(model_dir): source_dir = os.path.dirname(os.path.realpath(__file__)) if not os.path.exists(os.path.join(source_dir, ".git")): logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( source_dir )) return cur_hash = subprocess.getoutput("git rev-parse HEAD") path = os.path.join(model_dir, "githash") if os.path.exists(path): saved_hash = open(path).read() if saved_hash != cur_hash: logger.warn("git hash values are different. {}(saved) != {}(current)".format( saved_hash[:8], cur_hash[:8])) else: open(path, "w").write(cur_hash) def get_logger(model_dir, filename="train.log"): global logger logger = logging.getLogger(os.path.basename(model_dir)) logger.setLevel(logging.DEBUG) formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") if not os.path.exists(model_dir): os.makedirs(model_dir) h = logging.FileHandler(os.path.join(model_dir, filename)) h.setLevel(logging.DEBUG) h.setFormatter(formatter) logger.addHandler(h) return logger class HParams(): def __init__(self, **kwargs): for k, v in kwargs.items(): if type(v) == dict: v = HParams(**v) self[k] = v def keys(self): return self.__dict__.keys() def items(self): return self.__dict__.items() def values(self): return self.__dict__.values() def __len__(self): return len(self.__dict__) def __getitem__(self, key): return getattr(self, key) def __setitem__(self, key, value): return setattr(self, key, value) def __contains__(self, key): return key in self.__dict__ def __repr__(self): return self.__dict__.__repr__() diff --git a/AutoCoverTool/ref/so_vits_svc/vdecoder/hifigan/models.py b/AutoCoverTool/ref/so_vits_svc/vdecoder/hifigan/models.py index 9747301..629812e 100644 --- a/AutoCoverTool/ref/so_vits_svc/vdecoder/hifigan/models.py +++ b/AutoCoverTool/ref/so_vits_svc/vdecoder/hifigan/models.py @@ -1,503 +1,511 @@ import os import json from .env import AttrDict import numpy as np import torch import torch.nn.functional as F import torch.nn as nn from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from .utils import init_weights, get_padding LRELU_SLOPE = 0.1 def load_model(model_path, device='cuda'): config_file = os.path.join(os.path.split(model_path)[0], 'config.json') with open(config_file) as f: data = f.read() global h json_config = json.loads(data) h = AttrDict(json_config) generator = Generator(h).to(device) cp_dict = torch.load(model_path) generator.load_state_dict(cp_dict['generator']) generator.eval() generator.remove_weight_norm() del cp_dict return generator, h class ResBlock1(torch.nn.Module): def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): super(ResBlock1, self).__init__() self.h = h - self.convs1 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]))) - ]) + weight_norm_arr = [] + for i in range(len(dilation)): + weight_norm_arr.append(weight_norm(Conv1d(channels, channels, kernel_size, 1, 
dilation=dilation[i], + padding=get_padding(kernel_size, dilation[i]))), ) + self.convs1 = nn.ModuleList( + # [weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + # padding=get_padding(kernel_size, dilation[0]))), + # weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + # padding=get_padding(kernel_size, dilation[1]))), + # weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + # padding=get_padding(kernel_size, dilation[2]))) + # ] + weight_norm_arr + ) self.convs1.apply(init_weights) self.convs2 = nn.ModuleList([ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))) ]) self.convs2.apply(init_weights) def forward(self, x): for c1, c2 in zip(self.convs1, self.convs2): xt = F.leaky_relu(x, LRELU_SLOPE) xt = c1(xt) xt = F.leaky_relu(xt, LRELU_SLOPE) xt = c2(xt) x = xt + x return x def remove_weight_norm(self): for l in self.convs1: remove_weight_norm(l) for l in self.convs2: remove_weight_norm(l) class ResBlock2(torch.nn.Module): def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): super(ResBlock2, self).__init__() self.h = h self.convs = nn.ModuleList([ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))) ]) self.convs.apply(init_weights) def forward(self, x): for c in self.convs: xt = F.leaky_relu(x, LRELU_SLOPE) xt = c(xt) x = xt + x return x def remove_weight_norm(self): for l in self.convs: remove_weight_norm(l) def padDiff(x): - return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0) + return F.pad(F.pad(x, (0, 0, -1, 1), 'constant', 0) - x, (0, 0, 0, -1), 'constant', 0) + class SineGen(torch.nn.Module): """ Definition of sine generator SineGen(samp_rate, harmonic_num = 0, sine_amp = 0.1, noise_std = 0.003, voiced_threshold = 0, flag_for_pulse=False) samp_rate: sampling rate in Hz harmonic_num: number of harmonic overtones (default 0) sine_amp: amplitude of sine-wavefrom (default 0.1) noise_std: std of Gaussian noise (default 0.003) voiced_thoreshold: F0 threshold for U/V classification (default 0) flag_for_pulse: this SinGen is used inside PulseGen (default False) Note: when flag_for_pulse is True, the first time step of a voiced segment is always sin(np.pi) or cos(0) """ def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0, flag_for_pulse=False): super(SineGen, self).__init__() self.sine_amp = sine_amp self.noise_std = noise_std self.harmonic_num = harmonic_num self.dim = self.harmonic_num + 1 self.sampling_rate = samp_rate self.voiced_threshold = voiced_threshold self.flag_for_pulse = flag_for_pulse def _f02uv(self, f0): # generate uv signal uv = (f0 > self.voiced_threshold).type(torch.float32) return uv def _f02sine(self, f0_values): """ f0_values: (batchsize, length, dim) where dim indicates fundamental tone and overtones """ # convert to F0 in rad. 
The interger part n can be ignored # because 2 * np.pi * n doesn't affect phase rad_values = (f0_values / self.sampling_rate) % 1 # initial phase noise (no noise for fundamental component) rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \ device=f0_values.device) rand_ini[:, 0] = 0 rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) if not self.flag_for_pulse: # for normal case # To prevent torch.cumsum numerical overflow, # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. # Buffer tmp_over_one_idx indicates the time step to add -1. # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi tmp_over_one = torch.cumsum(rad_values, 1) % 1 tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 cumsum_shift = torch.zeros_like(rad_values) cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) else: # If necessary, make sure that the first time step of every # voiced segments is sin(pi) or cos(0) # This is used for pulse-train generation # identify the last time step in unvoiced segments uv = self._f02uv(f0_values) uv_1 = torch.roll(uv, shifts=-1, dims=1) uv_1[:, -1, :] = 1 u_loc = (uv < 1) * (uv_1 > 0) # get the instantanouse phase tmp_cumsum = torch.cumsum(rad_values, dim=1) # different batch needs to be processed differently for idx in range(f0_values.shape[0]): temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] # stores the accumulation of i.phase within # each voiced segments tmp_cumsum[idx, :, :] = 0 tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum # rad_values - tmp_cumsum: remove the accumulation of i.phase # within the previous voiced segment. i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) # get the sines sines = torch.cos(i_phase * 2 * np.pi) return sines def forward(self, f0): """ sine_tensor, uv = forward(f0) input F0: tensor(batchsize=1, length, dim=1) f0 for unvoiced steps should be 0 output sine_tensor: tensor(batchsize=1, length, dim) output uv: tensor(batchsize=1, length, 1) """ with torch.no_grad(): f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) # fundamental component fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)) # generate sine waveforms sine_waves = self._f02sine(fn) * self.sine_amp # generate uv signal # uv = torch.ones(f0.shape) # uv = uv * (f0 > self.voiced_threshold) uv = self._f02uv(f0) # noise: for unvoiced should be similar to sine_amp # std = self.sine_amp/3 -> max value ~ self.sine_amp # . 
for voiced regions is self.noise_std noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) # first: set the unvoiced part to 0 by uv # then: additive noise sine_waves = sine_waves * uv + noise return sine_waves, uv, noise class SourceModuleHnNSF(torch.nn.Module): """ SourceModule for hn-nsf SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0) sampling_rate: sampling_rate in Hz harmonic_num: number of harmonic above F0 (default: 0) sine_amp: amplitude of sine source signal (default: 0.1) add_noise_std: std of additive Gaussian noise (default: 0.003) note that amplitude of noise in unvoiced is decided by sine_amp voiced_threshold: threhold to set U/V given F0 (default: 0) Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) F0_sampled (batchsize, length, 1) Sine_source (batchsize, length, 1) noise_source (batchsize, length 1) uv (batchsize, length, 1) """ def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0): super(SourceModuleHnNSF, self).__init__() self.sine_amp = sine_amp self.noise_std = add_noise_std # to produce sine waveforms self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod) # to merge source harmonics into a single excitation self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() def forward(self, x): """ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) F0_sampled (batchsize, length, 1) Sine_source (batchsize, length, 1) noise_source (batchsize, length 1) """ # source for harmonic branch sine_wavs, uv, _ = self.l_sin_gen(x) sine_merge = self.l_tanh(self.l_linear(sine_wavs)) # source for noise branch, in the same shape as uv noise = torch.randn_like(uv) * self.sine_amp / 3 return sine_merge, noise, uv class Generator(torch.nn.Module): def __init__(self, h): super(Generator, self).__init__() self.h = h self.num_kernels = len(h["resblock_kernel_sizes"]) self.num_upsamples = len(h["upsample_rates"]) self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"])) self.m_source = SourceModuleHnNSF( sampling_rate=h["sampling_rate"], harmonic_num=8) self.noise_convs = nn.ModuleList() self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])): c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) self.ups.append(weight_norm( - ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), + ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), + h["upsample_initial_channel"] // (2 ** (i + 1)), k, u, padding=(k - u) // 2))) if i + 1 < len(h["upsample_rates"]): # stride_f0 = np.prod(h["upsample_rates"][i + 1:]) self.noise_convs.append(Conv1d( 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) else: self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = h["upsample_initial_channel"] // (2 ** (i + 1)) for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])): self.resblocks.append(resblock(h, ch, k, d)) self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) self.ups.apply(init_weights) self.conv_post.apply(init_weights) self.cond = 
nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1) def forward(self, x, f0, g=None): # print(1,x.shape,f0.shape,f0[:, None].shape) f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t # print(2,f0.shape) har_source, noi_source, uv = self.m_source(f0) har_source = har_source.transpose(1, 2) x = self.conv_pre(x) x = x + self.cond(g) # print(124,x.shape,har_source.shape) for i in range(self.num_upsamples): x = F.leaky_relu(x, LRELU_SLOPE) # print(3,x.shape) x = self.ups[i](x) x_source = self.noise_convs[i](har_source) # print(4,x_source.shape,har_source.shape,x.shape) x = x + x_source xs = None for j in range(self.num_kernels): if xs is None: xs = self.resblocks[i * self.num_kernels + j](x) else: xs += self.resblocks[i * self.num_kernels + j](x) x = xs / self.num_kernels x = F.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) return x def remove_weight_norm(self): print('Removing weight norm...') for l in self.ups: remove_weight_norm(l) for l in self.resblocks: l.remove_weight_norm() remove_weight_norm(self.conv_pre) remove_weight_norm(self.conv_post) class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() self.period = period norm_f = weight_norm if use_spectral_norm == False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), ]) self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) def forward(self, x): fmap = [] # 1d to 2d b, c, t = x.shape if t % self.period != 0: # pad first n_pad = self.period - (t % self.period) x = F.pad(x, (0, n_pad), "reflect") t = t + n_pad x = x.view(b, c, t // self.period, self.period) for l in self.convs: x = l(x) x = F.leaky_relu(x, LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) x = torch.flatten(x, 1, -1) return x, fmap class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, periods=None): super(MultiPeriodDiscriminator, self).__init__() self.periods = periods if periods is not None else [2, 3, 5, 7, 11] self.discriminators = nn.ModuleList() for period in self.periods: self.discriminators.append(DiscriminatorP(period)) def forward(self, y, y_hat): y_d_rs = [] y_d_gs = [] fmap_rs = [] fmap_gs = [] for i, d in enumerate(self.discriminators): y_d_r, fmap_r = d(y) y_d_g, fmap_g = d(y_hat) y_d_rs.append(y_d_r) fmap_rs.append(fmap_r) y_d_gs.append(y_d_g) fmap_gs.append(fmap_g) return y_d_rs, y_d_gs, fmap_rs, fmap_gs class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() norm_f = weight_norm if use_spectral_norm == False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv1d(1, 128, 15, 1, padding=7)), norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), ]) self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) def forward(self, x): fmap = [] for l in 
self.convs: x = l(x) x = F.leaky_relu(x, LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) x = torch.flatten(x, 1, -1) return x, fmap class MultiScaleDiscriminator(torch.nn.Module): def __init__(self): super(MultiScaleDiscriminator, self).__init__() self.discriminators = nn.ModuleList([ DiscriminatorS(use_spectral_norm=True), DiscriminatorS(), DiscriminatorS(), ]) self.meanpools = nn.ModuleList([ AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2) ]) def forward(self, y, y_hat): y_d_rs = [] y_d_gs = [] fmap_rs = [] fmap_gs = [] for i, d in enumerate(self.discriminators): if i != 0: y = self.meanpools[i - 1](y) y_hat = self.meanpools[i - 1](y_hat) y_d_r, fmap_r = d(y) y_d_g, fmap_g = d(y_hat) y_d_rs.append(y_d_r) fmap_rs.append(fmap_r) y_d_gs.append(y_d_g) fmap_gs.append(fmap_g) return y_d_rs, y_d_gs, fmap_rs, fmap_gs def feature_loss(fmap_r, fmap_g): loss = 0 for dr, dg in zip(fmap_r, fmap_g): for rl, gl in zip(dr, dg): loss += torch.mean(torch.abs(rl - gl)) return loss * 2 def discriminator_loss(disc_real_outputs, disc_generated_outputs): loss = 0 r_losses = [] g_losses = [] for dr, dg in zip(disc_real_outputs, disc_generated_outputs): r_loss = torch.mean((1 - dr) ** 2) g_loss = torch.mean(dg ** 2) loss += (r_loss + g_loss) r_losses.append(r_loss.item()) g_losses.append(g_loss.item()) return loss, r_losses, g_losses def generator_loss(disc_outputs): loss = 0 gen_losses = [] for dg in disc_outputs: l = torch.mean((1 - dg) ** 2) gen_losses.append(l) loss += l return loss, gen_losses diff --git a/AutoCoverTool/script/pure_model.py b/AutoCoverTool/script/pure_model.py new file mode 100644 index 0000000..39fa2a1 --- /dev/null +++ b/AutoCoverTool/script/pure_model.py @@ -0,0 +1,26 @@ +""" +只保留模型原始大小 +""" +import torch + + +def keep_pure(in_path, dst_path): + device = 'cuda' + mm = torch.load(in_path, map_location=device) + torch.save(mm["model"], dst_path) + + +def change_iter(in_path, dst_path): + device = 'cuda' + mm = torch.load(in_path, map_location=device) + mm["iteration"] = 1 + mm["learning_rate"] = 0.0001 + torch.save(mm, dst_path) + + +if __name__ == '__main__': + # keep_pure("data/train_users/qiankun_v1/logs/32k/G_1000.pth", + # "data/train_users/qiankun_v1/logs/32k/G_inference_1000.pth") + change_iter("data/online_models/models/base_model/sunyanzi_base_d_48000.pth", + "data/online_models/models/base_model/sunyanzi_base_d_48000_no_flow.pth", + ) diff --git a/AutoCoverTool/script/train.sh b/AutoCoverTool/script/train.sh index 5f7a31b..f1b3a8a 100644 --- a/AutoCoverTool/script/train.sh +++ b/AutoCoverTool/script/train.sh @@ -1,22 +1,57 @@ #export LD_LIBRARY_PATH=/data/gpu_env_common/env/anaconda3/envs/so_vits_svc/lib:$LD_LIBRARY_PATH export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin #export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame:$PWD/ref/adaptive_voice_conversion mkdir -p /data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists mkdir -p /data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/config # 1. 收集数据放到train_users/zjl/src # 2. 提取人声 & 分片 & 取音量响度大的Top80 /data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python script/get_vocals_for_train.py $1 # 3. 
重采样 /data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/resample.py --in_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1 --out_dir2=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample # 4. 生成配置文件 /data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/preprocess_flist_config.py --source_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample --train_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/train.txt --val_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/val.txt --test_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/test.txt --config_path=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/config/config.json # 5. 预处理提取特征 /data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/preprocess_hubert_f0.py --in_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample # 6. 拷贝数据到logs文件夹 mkdir -p data/train_users/multi_users/$1/logs/32k cp -r data/models/G_0.pth data/train_users/multi_users/$1/logs/32k cp -r data/models/D_0.pth data/train_users/multi_users/$1/logs/32k # 7. 训练 /data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/train.py -c data/train_users/multi_users/$1/config/config.json -m 32k -l data/train_users/multi_users/$1/logs + + + +mkdir -p data/train_users/qiankun_v2/filelists +mkdir -p data/train_users/qiankun_v2/config +# 1. 收集数据放到train_users/zjl/src +# 2. 提取人声 & 分片 & 取音量响度大的Top80 +python script/get_vocals_for_train.py $1 +# 3. 重采样 +python ref/so_vits_svc/resample.py --in_dir=data/train_users/qiankun_v2/vocals --out_dir2=data/train_users/qiankun_v2/slice_resample +# 4. 生成配置文件 +python ref/so_vits_svc/preprocess_flist_config.py --source_dir=data/train_users/qiankun_v2/slice_resample --train_list=data/train_users/qiankun_v2/filelists/train.txt --val_list=data/train_users/qiankun_v2/filelists/val.txt --test_list=data/train_users/qiankun_v2/filelists/test.txt --config_path=data/train_users/qiankun_v2/config/config.json +# 5. 预处理提取特征 +python ref/so_vits_svc/preprocess_hubert_f0.py --in_dir=data/train_users/qiankun_v2/slice_resample +# 6. 拷贝数据到logs文件夹 +mkdir -p data/train_users/qiankun_v2/logs/32k + +# 基础的 +cp -r data/models/base_D_0.pth data/train_users/xiafan_v1/logs/32k/D_0.pth +cp -r data/models/base_G_0.pth data/train_users/xiafan_v1/logs/32k/G_0.pth + +# 去除flow层的 +cp -r data/models/sunyanzi_v1_g_55000.pth data/train_users/qiankun_v1/logs/32k/G_0.pth +cp -r data/models/sunyanzi_v1_d_55000.pth data/train_users/qiankun_v1/logs/32k/D_0.pth + +# 去除flow层,解码器从v1->v3的 +cp -r data/models/train_base_v2_g_330000.pth data/train_users/qiankun_v2/logs/32k/G_0.pth +cp -r data/models/train_base_v2_d_330000.pth data/train_users/qiankun_v2/logs/32k/D_0.pth + +# 7. 
训练 +python ref/so_vits_svc/train.py -c data/train_users/qiankun_v1/config/config.json -m 32k -l data/train_users/qiankun_v1/logs + + + +python ref/so_vits_svc/preprocess_flist_config.py --source_dir=data/train_users/sunyanzi_v1/slice_resample --train_list=data/train_users/sunyanzi_v1/filelists/train.txt --val_list=data/train_users/sunyanzi_v1/filelists/val.txt --test_list=data/train_users/sunyanzi_v1/filelists/test.txt --config_path=data/train_users/sunyanzi_v1/config/config.json \ No newline at end of file diff --git a/AutoCoverTool/script/train_user_by_one_media.py b/AutoCoverTool/script/train_user_by_one_media.py new file mode 100644 index 0000000..e01176d --- /dev/null +++ b/AutoCoverTool/script/train_user_by_one_media.py @@ -0,0 +1,531 @@ +""" +使用一句话进行人声训练 +1. 数据集 +2. 训练 +""" +from ref.so_vits_svc.models import SynthesizerTrn, MultiPeriodDiscriminator +from ref.so_vits_svc.mel_processing import spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch +import ref.so_vits_svc.utils as utils +import ref.so_vits_svc.commons as commons +from ref.so_vits_svc.losses import kl_loss, generator_loss, discriminator_loss, feature_loss + +import logging + +logging.getLogger('numba').setLevel(logging.WARNING) + +import os +import time +import torch +import random +import librosa +import soundfile +import torchaudio +import parselmouth +import numpy as np +from tqdm import tqdm +from scipy.io.wavfile import read +from pyworld import pyworld +from copy import deepcopy +import torch.utils.data +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torch.cuda.amp import autocast, GradScaler + +gs_hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None) +gs_model_config = { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "upsample_rates": [10, 8, 2, 2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16, 16, 4, 4], + "n_layers_q": 3, + "use_spectral_norm": False, + "gin_channels": 256, + "ssl_dim": 256, + "n_speakers": 2 +} + +gs_train_config = { + "log_interval": 1, + "eval_interval": 1000, + "seed": 1234, + "epochs": 1000, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-09, + "batch_size": 12, + "fp16_run": False, + "lr_decay": 0.999875, + "segment_size": 17920, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 1.0, # 45 + "c_kl": 1.0, + "c_fm": 1.0, + "c_gen": 1.0, + "use_sr": True, + "max_speclen": 384 +} +gs_data_config = { + "max_wav_value": 32768.0, + "sampling_rate": 32000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": None +} + + +def get_f0(x, p_len, f0_up_key=0): + time_step = 160 / 16000 * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + + f0 = parselmouth.Sound(x, 16000).to_pitch_ac( + time_step=time_step / 1000, voicing_threshold=0.6, + pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] + if len(f0) > p_len: + f0 = f0[:p_len] + pad_size = (p_len - len(f0) + 1) // 2 + if (pad_size > 0 or p_len - len(f0) - pad_size > 0): + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant') + + f0 *= pow(2, f0_up_key / 12) + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = 
(f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + return f0_coarse, f0 + + +def resize2d(x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)), + source) + res = np.nan_to_num(target) + return res + + +def compute_f0(x, sr, c_len): + # x, sr = librosa.load(path, sr=32000) + f0, t = pyworld.dio( + x.astype(np.double), + fs=sr, + f0_ceil=800, + frame_period=1000 * 320 / sr, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, 32000) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + assert abs(c_len - x.shape[0] // 320) < 3, (c_len, f0.shape) + + return None, resize2d(f0, c_len) + + +def process(filename): + hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None) + save_name = filename + ".soft.pt" + if not os.path.exists(save_name): + devive = torch.device("cuda" if torch.cuda.is_available() else "cpu") + wav, _ = librosa.load(filename, sr=16000) + wav = torch.from_numpy(wav).unsqueeze(0).to(devive) + c = utils.get_hubert_content(hmodel, wav) + torch.save(c.cpu(), save_name) + else: + c = torch.load(save_name) + f0path = filename + ".f0.npy" + if not os.path.exists(f0path): + cf0, f0 = compute_f0(filename, c.shape[-1] * 2) + np.save(f0path, f0) + + +def clean_pitch(input_pitch): + num_nan = np.sum(input_pitch == 1) + if num_nan / len(input_pitch) > 0.9: + input_pitch[input_pitch != 1] = 1 + return input_pitch + + +class TextAudioSpeakerLoader(torch.utils.data.Dataset): + """ + 1) loads audio, speaker_id, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. 
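+ 4) for one-utterance fine-tuning it also extracts HuBERT content features (from 16 kHz audio) and pyworld F0, trims them to a common length with the spectrogram, and tiles everything up to max_speclen.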
+ """ + + def __init__(self, audio_path): + self.audio_path = audio_path + self.max_wav_value = gs_data_config['max_wav_value'] + self.sampling_rate = gs_data_config['sampling_rate'] + self.filter_length = gs_data_config['filter_length'] + self.hop_length = gs_data_config['hop_length'] + self.win_length = gs_data_config['win_length'] + self.use_sr = gs_train_config['use_sr'] + self.spec_len = gs_train_config['max_speclen'] + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.hmodel = gs_hmodel + + random.seed(1234) + self.audio_data = self.get_audio(audio_path) + + def get_audio(self, filename): + # 原始音频32k单声道 + + # 这里存在疑惑: + # audio, sr = librosa.load(filename, sr=self.sampling_rate, mono=True) + sr, audio = read(filename) + audio = torch.FloatTensor(audio.astype(np.float32)) + + audio_norm = audio / self.max_wav_value + audio_norm = torch.tensor(audio_norm) + audio_norm = audio_norm.unsqueeze(0) + + # 幅度谱 帧长1280(40ms),帧移320(10ms),shape为(641, frame_num) + spec = spectrogram_torch(audio_norm, self.filter_length, + self.sampling_rate, self.hop_length, self.win_length, + center=False) + # print(torch.mean(spec)) + spec = torch.squeeze(spec, 0) + spk = torch.LongTensor([0]) + + # # 提取hubert特征,shape为(256, frame_num // 2),后面做补齐 + wav = librosa.resample(audio.numpy(), sr, 16000) + wav = torch.from_numpy(wav).unsqueeze(0).to(self.device) + c = utils.get_hubert_content(self.hmodel, wav).squeeze(0) + + # 提取f0特征,shape为(frame_num) + cf0, f0 = compute_f0(audio.numpy(), sr, c.shape[-1] * 2) + f0 = torch.FloatTensor(f0) + c = torch.repeat_interleave(c, repeats=2, dim=1) # shape=(256, frame_num) + + lmin = min(c.size(-1), spec.size(-1), f0.shape[0]) + # 当assert的前面的条件不成立的时候,会报错,并给出后面的信息 + assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape, filename) + assert abs(lmin - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape) + assert abs(lmin - c.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape) + spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin] + audio_norm = audio_norm[:, :lmin * self.hop_length] + _spec, _c, _audio_norm, _f0 = spec, c, audio_norm, f0 + # 取幅度谱特征,hubert特征、f0信息 + while spec.size(-1) < self.spec_len: + spec = torch.cat((spec, _spec), -1) + c = torch.cat((c, _c), -1) + f0 = torch.cat((f0, _f0), -1) + audio_norm = torch.cat((audio_norm, _audio_norm), -1) + # hubert特征,f0,幅度谱特征,对应音频段波形,人声编码 + return c, f0, spec, audio_norm, spk + + def random_one(self): + c, f0, spec, audio_norm, spk = self.audio_data + start = random.randint(0, spec.size(-1) - self.spec_len) + end = start + self.spec_len + spec = spec[:, start:end] + c = c[:, start:end] + f0 = f0[start:end] + audio_norm = audio_norm[:, start * self.hop_length:end * self.hop_length] + return c, f0, spec, audio_norm, spk + + def __getitem__(self, index): + return self.random_one() + + def __len__(self): + return 1 + + +class SoVitsSVCOnlineTrain: + + def construct_model(self): + net_g = SynthesizerTrn( + gs_data_config["filter_length"] // 2 + 1, + gs_train_config["segment_size"] // gs_data_config["hop_length"], + **gs_model_config, + no_flow=False, + use_v3=False).cuda() + net_d = MultiPeriodDiscriminator(gs_model_config['use_spectral_norm']).cuda() + optim_g = torch.optim.AdamW( + net_g.parameters(), + 0.0001, + betas=[0.8, 0.99], + eps=1e-09) + optim_d = torch.optim.AdamW( + net_d.parameters(), + 0.0001, + betas=[0.8, 0.99], + eps=1e-09) + + # checkpoint_dict = torch.load(base_g_model, map_location='cuda') + net_g.load_state_dict(self.g_model_dict) + 
net_d.load_state_dict(self.d_model_dict) + optim_g.load_state_dict(self.g_opt_dict) + optim_d.load_state_dict(self.d_opt_dict) + + # 设置初始学习率 + optim_g.param_groups[0]['lr'] = 2e-4 + optim_d.param_groups[0]['lr'] = 2e-4 + return net_g, net_d, optim_g, optim_d + + def __init__(self, base_g_model, base_d_model): + st1 = time.time() + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + checkpoint_dict = torch.load(base_g_model, map_location='cpu') + self.g_model_dict = checkpoint_dict["model"] + self.g_opt_dict = checkpoint_dict["optimizer"] + + checkpoint_dict = torch.load(base_d_model, map_location='cpu') + self.d_model_dict = checkpoint_dict["model"] + self.d_opt_dict = checkpoint_dict["optimizer"] + + print("load model_path={},{},sp={}".format(base_g_model, base_d_model, time.time() - st1)) + + def get_units(self, source, sr): + source = source.unsqueeze(0).to(self.device) + print("source_shape===>", source.shape) + with torch.inference_mode(): + start = time.time() + units = gs_hmodel.units(source) + use_time = time.time() - start + print("hubert use time:{}".format(use_time)) + return units + + def get_unit_pitch(self, source, sr, tran): + source = torchaudio.functional.resample(source, sr, 16000) + if len(source.shape) == 2 and source.shape[1] >= 2: + source = torch.mean(source, dim=0).unsqueeze(0) + soft = self.get_units(source, sr).squeeze(0).cpu().numpy() + f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran) + return soft, f0 + + def train(self, in_wav, epoch_num): + train_dataset = TextAudioSpeakerLoader(in_wav) + train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, batch_size=12) + net_g, net_d, optim_g, optim_d = self.construct_model() + + rank = 0 + # 用于训练加速 + torch.set_float32_matmul_precision('high') + net_g.train() + net_d.train() + global_step = 0 + scaler = GradScaler(enabled=gs_train_config['fp16_run']) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=gs_train_config['lr_decay'], last_epoch=1) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=gs_train_config['lr_decay'], last_epoch=1) + # 根据上一次的情况来进行学习率更新 + # 思路: loss 下降 学习率增加,loss上升学习率减少 + for epoch in tqdm(range(0, epoch_num)): + for batch_idx, items in enumerate(train_loader): + # hubert特征,f0,幅度谱特征,对应音频段波形(384 * hop_length),人声编码[0] + c, f0, spec, y, spk = items + g = spk.cuda(rank, non_blocking=True) + spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True) + c = c.cuda(rank, non_blocking=True) + f0 = f0.cuda(rank, non_blocking=True) + """ + "sampling_rate": 32000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + """ + + # spec, n_fft, num_mels, sampling_rate, fmin, fmax + mel = spec_to_mel_torch(spec, gs_data_config['filter_length'], gs_data_config['n_mel_channels'], + gs_data_config['sampling_rate'], gs_data_config['mel_fmin'], + gs_data_config['mel_fmax']) + with autocast(enabled=gs_train_config['fp16_run']): + # net_g的输入: hubert特征,f0,幅度谱特征,说话人id,mel谱特征 + # net_g的输出: + # 原始波形,批次中每个采样到的帧的位置,批次中幅度谱的有效帧位置, + # 幅度谱编码得到正态分布后随机采样得到的z, z经过标准化流之后得到z_p, hubert特征层得到的正态分布的均值, + # hubert特征层得到的正态分布的标准差(logs_p),幅度谱和人声信息得到的均值(m_q),幅度谱和人声信息得到的标准差(logs_q) + y_hat, ids_slice, z_mask, \ + (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel) + + y_mel = commons.slice_segments(mel, ids_slice, + gs_train_config['segment_size'] // gs_data_config['hop_length']) + + y_hat_mel = mel_spectrogram_torch( + 
y_hat.squeeze(1), + gs_data_config['filter_length'], + gs_data_config['n_mel_channels'], + gs_data_config['sampling_rate'], + gs_data_config['hop_length'], + gs_data_config['win_length'], + gs_data_config['mel_fmin'], + gs_data_config['mel_fmax'] + ) + y = commons.slice_segments(y, ids_slice * gs_data_config['hop_length'], + gs_train_config['segment_size']) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) + + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) + loss_disc_all = loss_disc + + optim_d.zero_grad() + scaler.scale(loss_disc_all).backward() + scaler.unscale_(optim_d) + scaler.step(optim_d) + with autocast(enabled=gs_train_config['fp16_run']): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) + with autocast(enabled=False): + # mel谱之间的损失函数,后面是系数,误差越小越好 + loss_mel = F.l1_loss(y_mel, y_hat_mel) * gs_train_config['c_mel'] + # KL散度,z_p: 幅度谱侧得到的采样值经过标准化流之后的结果,logs_q: 幅度谱侧得到的标准差,m_p:hubert侧得到的均值 + # logs_p: hubert侧得到的标准差,z_mask: 批次中幅度谱的有效帧位置, + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * gs_train_config['c_kl'] + # 在d模型中将y和y_hat的每一层特征结果都拿出来,做l1距离 + loss_fm = feature_loss(fmap_r, fmap_g) * gs_train_config['c_fm'] + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen * gs_train_config['c_gen'] + loss_fm + loss_mel + loss_kl + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + scaler.step(optim_g) + scaler.update() + + if global_step % gs_train_config['log_interval'] == 0: + lr = optim_g.param_groups[0]['lr'] + losses_numpy = [round(loss_disc.item(), 3), round(loss_gen.item(), 3), + round(loss_fm.item(), 3), round(loss_mel.item(), 3), round(loss_kl.item(), 3)] + print("gstep={},lr={},disc={},gen={},fm={},mel={},kl={},tot={}".format(global_step, lr, + losses_numpy[0], + losses_numpy[1], + losses_numpy[2], + losses_numpy[3], + losses_numpy[4], + sum(losses_numpy))) + + if global_step % 200 == 0: + torch.save(net_g.state_dict(), "data/web_trained_models/xiafan_{}.pth".format(global_step)) + + global_step += 1 + + scheduler_g.step() + scheduler_d.step() + return net_g + + def infer(self, in_wav, dst_wav, model): + tran = 0 # 变化的音高 + source, sr = librosa.load(in_wav, sr=32000, mono=True) + source = torch.tensor(source).unsqueeze(0) + sid = torch.LongTensor([0]).to(self.device).unsqueeze(0) + soft, pitch = self.get_unit_pitch(source, sr, tran) + f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device) + stn_tst = torch.FloatTensor(soft) + + with torch.no_grad(): + model.eval() + x_tst = stn_tst.unsqueeze(0).to(self.device) + start = time.time() + x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2) + audio = model.infer(x_tst, f0=f0, g=sid)[0, 0].data.float() + use_time = time.time() - start + print("vits use time:{}".format(use_time)) + # 写入文件 + soundfile.write(dst_wav, audio.cpu().numpy(), sr, format='wav') + + ####### 对外接口,训练并预测 + def process_train_and_infer(self, train_media, in_path, dst_path, dst_model_path=None, params={}): + """ + :param train_media: 训练时使用的数据 + :param in_path: 待转换的人声信息 + :param dst_path: 转换后的文件地址 + :param dst_model_path: 是否缓存模型 + :return: + """ + # 对train_media转码为32k单声道 + tmp_wav = train_media + "_321.wav" + cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(train_media, tmp_wav) + os.system(cmd) + if not os.path.exists(tmp_wav): + return 1 + in_wav_tmp = in_path + "_321.wav" + cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(in_path, in_wav_tmp) + 
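# like the training clip, the clip to be converted is transcoded to 32 kHz mono so it matches gs_data_config['sampling_rate']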
os.system(cmd) + if not os.path.exists(in_wav_tmp): + os.unlink(tmp_wav) + return 2 + + global gs_train_config + max_step = params.get('max_step', 200) + gs_train_config['c_mel'] = params.get("c_mel", 45) + gs_train_config['c_fm'] = params.get("c_fm", 1.0) + gs_train_config['c_gen'] = params.get("c_gen", 1.0) + + print("params:{}".format(params)) + st = time.time() + model = self.train(tmp_wav, max_step) + print("train sp={}".format(time.time() - st)) + + st = time.time() + self.infer(in_wav_tmp, dst_path, model) + print("infer sp={}".format(time.time() - st)) + + if dst_model_path is not None: + st = time.time() + torch.save(model.state_dict(), dst_model_path) + print("save model sp={}".format(time.time() - st)) + + os.unlink(tmp_wav) + os.unlink(in_wav_tmp) + return 0 + + # 推理结果 + def process_infer(self, model_path, in_path, dst_path): + net_g = SynthesizerTrn( + gs_data_config["filter_length"] // 2 + 1, + gs_train_config["segment_size"] // gs_data_config["hop_length"], + **gs_model_config, + no_flow=False, + use_v3=False).cuda() + model_dict = torch.load(model_path, map_location='cpu') + net_g.load_state_dict(model_dict) + in_wav_tmp = in_path + "_321.wav" + cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(in_path, in_wav_tmp) + os.system(cmd) + if not os.path.exists(in_wav_tmp): + return 2 + + self.infer(in_wav_tmp, dst_path, net_g) + + def get_f0(self, vocal_path): + get_f0() + + +if __name__ == '__main__': + pp = "data/train_users/qiankun_v1/vocals/speaker0/qiankun.wav" + in_p = "data/test/vocal_32.wav" + dst_p = "data/test/vocal_32_out.wav" + dst_m_p = "data/test/mm.pth" + + g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth" + d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth" + svsot = SoVitsSVCOnlineTrain(g_path, d_path) + + start_time = time.time() + ret = svsot.process_train_and_infer(pp, in_p, dst_p, dst_m_p) + print("process = {} ret={}".format(time.time() - start_time, ret)) diff --git a/AutoCoverTool/svc_inference/config.json b/AutoCoverTool/svc_inference/config.json index 8399ea3..ec1f0a5 100644 --- a/AutoCoverTool/svc_inference/config.json +++ b/AutoCoverTool/svc_inference/config.json @@ -1,90 +1,91 @@ { "train": { "log_interval": 200, "eval_interval": 1000, "seed": 1234, "epochs": 1000, "learning_rate": 0.0001, "betas": [ 0.8, 0.99 ], "eps": 1e-09, "batch_size": 12, "fp16_run": false, "lr_decay": 0.999875, "segment_size": 17920, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0, "use_sr": true, "max_speclen": 384, "port": "8002" }, "data": { "training_files": "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/dlj_v1/filelists/train.txt", "validation_files": "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/dlj_v1/filelists/val.txt", "max_wav_value": 32768.0, "sampling_rate": 32000, "filter_length": 1280, "hop_length": 320, "win_length": 1280, "n_mel_channels": 80, "mel_fmin": 0.0, "mel_fmax": null }, "model": { "inter_channels": 192, "hidden_channels": 192, "filter_channels": 768, "n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0.1, "resblock": "1", "resblock_kernel_sizes": [ 3, 7, 11 ], "resblock_dilation_sizes": [ [ 1, 3, 5 ], [ 1, 3, 5 ], [ 1, 3, 5 ] ], "upsample_rates": [ 10, 8, 2, 2 ], "upsample_initial_channel": 512, "upsample_kernel_sizes": [ 16, 16, 4, 4 ], "n_layers_q": 3, "use_spectral_norm": false, "gin_channels": 256, "ssl_dim": 256, "n_speakers": 2 }, "spk": { - "speaker0": 0 + "speaker0": 0, + "speaker1": 1 } } \ No newline at end of file diff --git 
a/AutoCoverTool/svc_inference/webui.py b/AutoCoverTool/svc_inference/webui.py index 48a7031..0652207 100644 --- a/AutoCoverTool/svc_inference/webui.py +++ b/AutoCoverTool/svc_inference/webui.py @@ -1,76 +1,77 @@ """ 构建唱歌音色转换网页(基于3.0) 要求: 1. 音频上传 2. 推理 3. 下载 """ import os import time import glob import shutil import librosa import soundfile import gradio as gr from online.inference_one import inf gs_tmp_dir = "/tmp/svc_inference_one_web" -gs_model_dir = "/data/prod/so_vits_models/3.0" +# gs_model_dir = "/data/prod/so_vits_models/3.0" +gs_model_dir = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models" gs_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json") gs_models_choices = glob.glob(os.path.join(gs_model_dir, "*/*pth")) gs_model_list_dropdown = None def svc(audio_data, model_path): sr, data = audio_data if os.path.exists(gs_tmp_dir): shutil.rmtree(gs_tmp_dir) os.makedirs(gs_tmp_dir) tmp_path = os.path.join(gs_tmp_dir, "tmp.wav") soundfile.write(tmp_path, data, sr, format="wav") # 重采样到32k audio, sr = librosa.load(tmp_path, sr=32000, mono=True) tmp_path = os.path.join(gs_tmp_dir, "tmp_32.wav") out_path = os.path.join(gs_tmp_dir, "out.wav") soundfile.write(tmp_path, data, sr, format="wav") # 推理 print("svc: {}".format(model_path)) st = time.time() inf(model_path, gs_config_path, tmp_path, out_path, 'cuda') print("input d={}, sp = {}".format(len(audio) / sr, time.time() - st)) return out_path def model_select(): files = glob.glob(os.path.join(gs_model_dir, "*/*pth")) return gs_model_list_dropdown.update(choices=files) def main(): # header app = gr.Blocks() with app: # 头部介绍 gr.Markdown(value=""" ### 唱歌音色转换 作者:starmaker音视频 """) global gs_model_list_dropdown gs_model_list_dropdown = gr.Dropdown(choices=gs_models_choices, interactive=True, label="model list") refresh_btn = gr.Button("refresh_model_list") refresh_btn.click(fn=model_select, inputs=[], outputs=gs_model_list_dropdown) # 提示词输入框 input_audio = gr.inputs.Audio(label="input") gen_btn = gr.Button("generate", variant="primary") output_audio = gr.outputs.Audio(label="output", type='filepath') gen_btn.click(fn=svc, inputs=[input_audio, gs_model_list_dropdown], outputs=output_audio) # 本方法实现同一时刻只有一个程序在服务器端运行 app.queue(concurrency_count=1, max_size=2044).launch(server_name="0.0.0.0", inbrowser=True, quiet=True, server_port=7860) if __name__ == '__main__': main() diff --git a/AutoCoverTool/svc_inference/webui_play.py b/AutoCoverTool/svc_inference/webui_play.py new file mode 100644 index 0000000..abe787f --- /dev/null +++ b/AutoCoverTool/svc_inference/webui_play.py @@ -0,0 +1,133 @@ +""" +构建唱歌音色转换网页(基于3.0) +1. 要求上传一个音频 +2. 选定男女声 +3. 选择一首歌曲 +4. 
训练特定轮次并合成歌曲 +""" + +import os +import time +import glob +import hashlib +import shutil +import librosa +import soundfile +import gradio as gr +from script.train_user_by_one_media import SoVitsSVCOnlineTrain + +gs_g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth" +gs_d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth" +gs_out_model_dir = "data/web_trained_models_for_play" +gs_out_audio_dir = "data/web_trained_models_for_play_audio" +gs_work_dir = "/tmp/train_user_by_one_media_for_play" +gs_ssot_inst = SoVitsSVCOnlineTrain(gs_g_path, gs_d_path) +gs_draw_volume_exe = "/data/gpu_env_common/bin/draw_volume" +gs_simple_mixer_path = "/data/gpu_env_common/bin/simple_mixer" + + +def get_song_map(): + female_song_names = [] + song_list = glob.glob("data/resource/female/*") + for song in song_list: + female_song_names.append(song.replace("data/resource/female/", "")) + male_song_names = [] + song_list = glob.glob("data/resource/male/*") + for song in song_list: + male_song_names.append(song.replace("data/resource/male/", "")) + song_map = { + "female": female_song_names, + "male": male_song_names, + } + return song_map + + +gs_song_map = get_song_map() +gs_song_list_dropdown = None + + +def song_select(gender): + return gs_song_list_dropdown.update(choices=gs_song_map[gender]), gs_song_map[gender][0] + + +def get_file_md5(filename): + with open(filename, "rb") as fp: + return hashlib.md5(fp.read()).hexdigest() + + +def mix(in_path, acc_path, dst_path): + # svc转码到442 + svc_442_file = in_path + "_442.wav" + st = time.time() + cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(in_path, svc_442_file) + os.system(cmd) + if not os.path.exists(svc_442_file): + return -1 + print("transcode,{},sp={}".format(in_path, time.time() - st)) + + # 混合 + st = time.time() + cmd = "{} {} {} {} 1".format(gs_simple_mixer_path, svc_442_file, acc_path, dst_path) + os.system(cmd) + print("mixer,{},sp={}".format(in_path, time.time() - st)) + + +def train_svc(train_audio_data, gender, song_name): + if os.path.exists(gs_work_dir): + shutil.rmtree(gs_work_dir) + os.makedirs(gs_work_dir) + + train_audio_path = os.path.join(gs_work_dir, "inf.wav") + + sr, data = train_audio_data + soundfile.write(train_audio_path, data, samplerate=sr, format='wav') + + inf_audio_path = os.path.join("data/resource/{}/{}/vocal321.wav".format(gender, song_name)) # 人声 + inf_acc_path = os.path.join("data/resource/{}/{}/acc.wav".format(gender, song_name)) # 伴奏 + inf_out_path = os.path.join(gs_work_dir, "tmp.wav") + print("svc: {}".format(train_audio_path)) + + st = time.time() + md5 = get_file_md5(train_audio_path) + out_model_path = os.path.join(gs_out_model_dir, "{}.pth".format(md5)) + + print("inputMsg:", train_audio_path, inf_audio_path, out_model_path) + if os.path.exists(out_model_path): + err = gs_ssot_inst.process_infer(out_model_path, inf_audio_path, inf_out_path) + else: + err = gs_ssot_inst.process_train_and_infer(train_audio_path, inf_audio_path, inf_out_path, out_model_path) + + # 人声伴奏合并 + out_path = os.path.join(gs_out_audio_dir, "{}.wav".format(md5)) + mix(inf_out_path, inf_acc_path, out_path) + print("input err={}, sp = {}".format(err, time.time() - st)) + return out_path + + +def main(): + # header + app = gr.Blocks() + with app: + # 头部介绍 + gr.Markdown(value=""" + ### 用你的音色来唱歌 + #### 使用说明: 手机录一个15s左右的音频,拖拽到网页上,点击开始后,稍等2分钟~ + 作者:starmaker音视频 + """) + # train_audio_path + train_audio = gr.inputs.Audio(label="input_audio") + gender = gr.inputs.Radio(choices=["female", "male"], 
default="female") + global gs_song_list_dropdown + gs_song_list_dropdown = gr.Dropdown(choices=gs_song_map["female"], interactive=True, label="song list") + gender.change(song_select, inputs=[gender], outputs=[gs_song_list_dropdown, gs_song_list_dropdown]) + gen_btn = gr.Button("generate", variant="primary") + + output_audio = gr.outputs.Audio(label="output", type='filepath') + gen_btn.click(fn=train_svc, inputs=[train_audio, gender, gs_song_list_dropdown], outputs=output_audio) + # 本方法实现同一时刻只有一个程序在服务器端运行 + app.queue(concurrency_count=1, max_size=2044).launch(server_name="0.0.0.0", inbrowser=True, quiet=True, + server_port=7860) + + +if __name__ == '__main__': + main() diff --git a/AutoCoverTool/svc_inference/webui_v1.py b/AutoCoverTool/svc_inference/webui_v1.py new file mode 100644 index 0000000..b05c8af --- /dev/null +++ b/AutoCoverTool/svc_inference/webui_v1.py @@ -0,0 +1,107 @@ +""" +构建唱歌音色转换网页(基于3.0) +要求: +1. 音频上传 +2. 训练 +3. 推理 +4. 下载 +第一版功能: 给定两个文件(1) 待训练音频 (2) 待推理音频 输出 音色转换后的音频 +""" +import os +import time +import glob +import shutil +import librosa +import soundfile +import gradio as gr +from script.train_user_by_one_media import SoVitsSVCOnlineTrain + +gs_g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth" +gs_d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth" +gs_out_model_dir = "data/web_trained_models" +gs_work_dir = "/tmp/train_user_by_one_media" +gs_ssot_inst = SoVitsSVCOnlineTrain(gs_g_path, gs_d_path) +gs_models_choices = glob.glob(os.path.join(gs_out_model_dir, "*pth")) + + +def train_svc(train_audio_path, inf_audio_path, name, train_step, c_mel, c_fm, c_gen): + train_step = int(train_step) + if os.path.exists(gs_work_dir): + shutil.rmtree(gs_work_dir) + print("svc: {}".format(train_audio_path, inf_audio_path)) + st = time.time() + os.makedirs(gs_work_dir) + out_path = os.path.join(gs_work_dir, "tmp.wav") + out_model_path = None + if name != "": + out_model_path = os.path.join(gs_out_model_dir, "{}_{}.pth".format(name, train_step)) + print("inputMsg:", train_audio_path, inf_audio_path, out_path, out_model_path, train_step) + err = gs_ssot_inst.process_train_and_infer(train_audio_path, inf_audio_path, out_path, out_model_path, + params={'max_step': 100, 'c_mel': int(c_mel), 'c_fm': int(c_fm), + 'c_gen': int(c_gen)}) + print("input err={}, step={}, sp = {}".format(err, train_step, time.time() - st)) + return out_path + + +def svc_v1(inf_audio_path, model_path): + print("svc={}", model_path) + if os.path.exists(gs_work_dir): + shutil.rmtree(gs_work_dir) + os.makedirs(gs_work_dir) + + # 重采样到32k + audio, sr = librosa.load(inf_audio_path, sr=32000, mono=True) + tmp_path = os.path.join(gs_work_dir, "tmp_32.wav") + out_path = os.path.join(gs_work_dir, "out.wav") + soundfile.write(tmp_path, audio, sr, format="wav") + + # 推理 + print("svc: {}".format(model_path)) + st = time.time() + gs_ssot_inst.process_infer(model_path, tmp_path, out_path) + print("input d={}, sp = {}".format(len(audio) / sr, time.time() - st)) + return out_path + + +def model_select(): + files = glob.glob(os.path.join(gs_out_model_dir, "*pth")) + return gs_model_list_dropdown.update(choices=files) + + +def main(): + # header + app = gr.Blocks() + with app: + # 头部介绍 + gr.Markdown(value=""" + ### 唱歌音色转换 + 作者:starmaker音视频 + """) + global gs_model_list_dropdown + gs_model_list_dropdown = gr.Dropdown(choices=gs_models_choices, interactive=True, label="model list") + refresh_btn = gr.Button("refresh_model_list") + refresh_btn.click(fn=model_select, inputs=[], 
outputs=gs_model_list_dropdown) + + # train_audio_path + train_audio = gr.inputs.Audio(label="train_audio", type='filepath') + svc_audio = gr.inputs.Audio(label="svc_audio", type='filepath') + name_text = gr.inputs.Textbox(label="model_name", default="") + with gr.Row(): + max_step = gr.inputs.Number(label="max_step", default=100) + c_mel = gr.inputs.Number(label="c_mel", default=45) + c_fm = gr.inputs.Number(label="c_fm", default=1) + c_gen = gr.inputs.Number(label="c_gen", default=1) + gen_btn = gr.Button("generate", variant="primary") + gen_btn1 = gr.Button("generateByModel", variant="primary") + + output_audio = gr.outputs.Audio(label="output", type='filepath') + gen_btn.click(fn=train_svc, inputs=[train_audio, svc_audio, name_text, max_step, c_mel, c_fm, c_gen], + outputs=output_audio) + gen_btn1.click(fn=svc_v1, inputs=[svc_audio, gs_model_list_dropdown], outputs=output_audio) + # 本方法实现同一时刻只有一个程序在服务器端运行 + app.queue(concurrency_count=1, max_size=2044).launch(server_name="0.0.0.0", inbrowser=True, quiet=True, + server_port=7860) + + +if __name__ == '__main__': + main()
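For reference, the one-shot pipeline added in script/train_user_by_one_media.py can also be driven without the Gradio front ends. Below is a minimal sketch; the audio paths and the cached model name are placeholders, the base checkpoints are the ones referenced by webui_v1.py / webui_play.py, and the loss-weight values simply restate the defaults read inside process_train_and_infer.

from script.train_user_by_one_media import SoVitsSVCOnlineTrain

# base generator / discriminator checkpoints (same as the web UIs)
g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth"
d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth"
svc = SoVitsSVCOnlineTrain(g_path, d_path)

# one-shot fine-tune on a short vocal clip, then convert another clip;
# both inputs are transcoded to 32 kHz mono internally via ffmpeg
err = svc.process_train_and_infer(
    train_media="data/test/train_clip_15s.wav",       # placeholder input
    in_path="data/test/vocal_to_convert.wav",         # placeholder input
    dst_path="data/test/converted.wav",
    dst_model_path="data/web_trained_models/demo.pth",
    params={"max_step": 200, "c_mel": 45, "c_fm": 1.0, "c_gen": 1.0})

# later requests can reuse the cached voice model and skip training,
# which is how webui_play.py serves repeat users (keyed by file md5)
if err == 0:
    svc.process_infer("data/web_trained_models/demo.pth",
                      "data/test/another_vocal.wav",
                      "data/test/another_converted.wav")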