
diff --git a/AutoCoverTool/ref/so_vits_svc/apply_model.py b/AutoCoverTool/ref/so_vits_svc/apply_model.py
new file mode 100644
index 0000000..7570e91
--- /dev/null
+++ b/AutoCoverTool/ref/so_vits_svc/apply_model.py
@@ -0,0 +1,147 @@
+from ref.so_vits_svc.models import *
+import time
+import torch
+from thop import profile
+from thop import clever_format
+
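+# Hyperparameters for SynthesizerTrn below; they mirror the "model" section of the 32 kHz so-vits-svc config.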
+gs_model_config = {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 10,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "n_layers_q": 3,
+ "use_spectral_norm": False,
+ "gin_channels": 256,
+ "ssl_dim": 256,
+ "n_speakers": 2
+}
+
+
+def load_model():
+ mm = SynthesizerTrn(
+ 641, 56,
+ **gs_model_config
+ )
+ device = 'cuda'
+ checkpoint_path = "data/train_users/xiafan/logs/32k/G_0.pth"
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cuda')
+ mm.load_state_dict(checkpoint_dict["model"])
+ mm.eval()
+ mm.to(device)
+
+ # params = mm.state_dict()
+ # save_path = "/tmp/t1.pth"
+ # torch.save(params, save_path)
+
+ # Dummy single-frame inputs for forward(c, f0, spec, g): hubert features, f0, speaker id, and magnitude spectrogram (641 = filter_length // 2 + 1)
+ hub_data = torch.rand((1, 256, 1)).to(device)
+ f0 = torch.rand((1, 1)).to(device)
+ g = torch.tensor([[0]]).to(device)
+ spec = torch.rand((1, 641, 1)).to(device)
+
+ with torch.no_grad():
+ for i in range(0, 10):
+ st = time.time()
+ mm(hub_data, f0, spec, g)
+ print("sp = {}".format(time.time() - st))
+
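+ # Profile one forward pass with thop to get FLOPs and parameter counts for the dummy inputs above.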
+ flops, params = profile(mm, inputs=(hub_data, f0, spec, g), verbose=True)
+ flops, params = clever_format([flops, params], "%.3f")
+ # 487.096M 58.126M
+ print(flops, params)
+ # macs, params = clever_format([flops, params], "%.3f")
+ # print(macs)
+ # print(params)
+ #
+ # print(mm)
+ # stat(mm, input_size=(hub_data, f0, spec, g))
+ # print("%s | %.2f | %.2f" % ("synthesizer_trn", params / (1000 ** 2), flops / (1000 ** 3)))
+
+
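+# Reset iteration / learning rate and drop the optimizer state from a checkpoint, saving a smaller "_no_opt" copy.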
+def change_iter():
+ checkpoint_path = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models/base_model/sunyanzi_base_48000.pth"
+ dst_checkpoint_path = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models/base_model/sunyanzi_base_48000_no_opt.pth"
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cuda')
+ checkpoint_dict["iteration"] = 1
+ checkpoint_dict["learning_rate"] = 1e-4
+ checkpoint_dict["optimizer"] = None
+ torch.save(checkpoint_dict, dst_checkpoint_path)
+
+
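+# Rebuild a checkpoint without the flow module: copy every weight except "flow.*" into a no_flow SynthesizerTrn, run one forward pass as a sanity check, and save the result.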
+def reload_model():
+ in_checkpoint_path = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models/base_model/sunyanzi_base_48000.pth"
+ out_checkpoint_path = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models/base_model/sunyanzi_base_48000_no_flow.pth"
+ checkpoint_dict = torch.load(in_checkpoint_path, map_location='cuda')
+ device = 'cuda'
+ model = SynthesizerTrn(
+ 641, 56,
+ **gs_model_config,
+ no_flow=True,
+ use_v3=False
+ )
+ state_dict = {}
+ for k in checkpoint_dict["model"].keys():
+ if str(k).startswith("flow"):
+ continue
+ state_dict[k] = checkpoint_dict["model"][k]
+
+ model.load_state_dict(state_dict)
+ model.eval()
+ model.to(device)
+ # print(model)
+ hub_data = torch.rand((1, 256, 1)).to(device)
+ f0 = torch.rand((1, 1)).to(device)
+ g = torch.tensor([[0]]).to(device)
+ spec = torch.rand((1, 641, 1)).to(device)
+ o, ids_slice, spec_mask, (z, z, m_p, logs_p, m_q, logs_q) = model(hub_data, f0, spec, g)
+ print(o.shape)
+
+ checkpoint_dict["model"] = model.state_dict()
+ checkpoint_dict["iteration"] = 1
+ checkpoint_dict["learning_rate"] = 1e-4
+ checkpoint_dict["optimizer"] = None
+ torch.save(checkpoint_dict, out_checkpoint_path)
+
+
+if __name__ == '__main__':
+ # reload_model()
+ change_iter()
\ No newline at end of file
diff --git a/AutoCoverTool/svc_inference/config.json b/AutoCoverTool/ref/so_vits_svc/config.json
similarity index 100%
copy from AutoCoverTool/svc_inference/config.json
copy to AutoCoverTool/ref/so_vits_svc/config.json
diff --git a/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py b/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py
index 628cbdf..06a4676 100644
--- a/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py
+++ b/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py
@@ -1,342 +1,433 @@
import hashlib
import io
import json
import logging
import os
import time
from pathlib import Path
import librosa
import maad
import numpy as np
# import onnxruntime
import parselmouth
import soundfile
import torch
import torchaudio
from hubert import hubert_model
import utils
from models import SynthesizerTrn
import copy
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from mel_processing import spectrogram_torch, spec_to_mel_torch
def get_spec(audio):
audio_norm = audio
print(audio_norm)
spec = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False)
return spec
def read_temp(file_name):
if not os.path.exists(file_name):
with open(file_name, "w") as f:
f.write(json.dumps({"info": "temp_dict"}))
return {}
else:
try:
with open(file_name, "r") as f:
data = f.read()
data_dict = json.loads(data)
if os.path.getsize(file_name) > 50 * 1024 * 1024:
f_name = file_name.replace("\\", "/").split("/")[-1]
print(f"clean {f_name}")
for wav_hash in list(data_dict.keys()):
if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
del data_dict[wav_hash]
except Exception as e:
print(e)
print(f"{file_name} error,auto rebuild file")
data_dict = {"info": "temp_dict"}
return data_dict
def write_temp(file_name, data):
with open(file_name, "w") as f:
f.write(json.dumps(data))
def timeit(func):
def run(*args, **kwargs):
t = time.time()
res = func(*args, **kwargs)
print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
return res
return run
def format_wav(audio_path):
if Path(audio_path).suffix == '.wav':
return
raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
def get_end_file(dir_path, end):
file_lists = []
for root, dirs, files in os.walk(dir_path):
files = [f for f in files if f[0] != '.']
dirs[:] = [d for d in dirs if d[0] != '.']
for f_file in files:
if f_file.endswith(end):
file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
return file_lists
def get_md5(content):
return hashlib.new("md5", content).hexdigest()
def resize2d_f0(x, target_len):
source = np.array(x)
source[source < 0.001] = np.nan
target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
source)
res = np.nan_to_num(target)
return res
def get_f0(x, p_len, f0_up_key=0):
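# Extract f0 with parselmouth (Praat autocorrelation), pad or trim it to p_len frames, shift it by f0_up_key semitones, and quantize it onto 255 mel-scale bins.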
time_step = 160 / 16000 * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
time_step=time_step / 1000, voicing_threshold=0.6,
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
if len(f0) > p_len:
f0 = f0[:p_len]
pad_size = (p_len - len(f0) + 1) // 2
if (pad_size > 0 or p_len - len(f0) - pad_size > 0):
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
f0 *= pow(2, f0_up_key / 12)
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0
def clean_pitch(input_pitch):
num_nan = np.sum(input_pitch == 1)
if num_nan / len(input_pitch) > 0.9:
input_pitch[input_pitch != 1] = 1
return input_pitch
def plt_pitch(input_pitch):
input_pitch = input_pitch.astype(float)
input_pitch[input_pitch == 1] = np.nan
return input_pitch
def f0_to_pitch(ff):
f0_pitch = 69 + 12 * np.log2(ff / 440)
return f0_pitch
def fill_a_to_b(a, b):
if len(a) < len(b):
for _ in range(0, len(b) - len(a)):
a.append(a[0])
def mkdir(paths: list):
for path in paths:
if not os.path.exists(path):
os.mkdir(path)
class Svc(object):
def __init__(self, net_g_path, config_path, hubert_path="/data/prod/so_vits_models/models/hubert-soft-0d54a1f4.pt",
onnx=False):
self.onnx = onnx
self.net_g_path = net_g_path
self.hubert_path = hubert_path
self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.net_g_ms = None
self.hps_ms = utils.get_hparams_from_file(config_path)
self.target_sample = self.hps_ms.data.sampling_rate
self.hop_size = self.hps_ms.data.hop_length
self.speakers = {}
for spk, sid in self.hps_ms.spk.items():
self.speakers[sid] = spk
self.spk2id = self.hps_ms.spk
# load the hubert content encoder
self.hubert_soft = hubert_model.hubert_soft(hubert_path)
if torch.cuda.is_available():
self.hubert_soft = self.hubert_soft.cuda()
self.load_model()
def load_model(self):
# build the model according to the config
if self.onnx:
raise NotImplementedError
# self.net_g_ms = SynthesizerTrnForONNX(
# 178,
# self.hps_ms.data.filter_length // 2 + 1,
# self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
# n_speakers=self.hps_ms.data.n_speakers,
# **self.hps_ms.model)
# _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
else:
self.net_g_ms = SynthesizerTrn(
self.hps_ms.data.filter_length // 2 + 1,
self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
- **self.hps_ms.model)
+ **self.hps_ms.model, no_flow=True, use_v3=True)
_ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
if "half" in self.net_g_path and torch.cuda.is_available():
_ = self.net_g_ms.half().eval().to(self.dev)
else:
_ = self.net_g_ms.eval().to(self.dev)
def get_units(self, source, sr):
source = source.unsqueeze(0).to(self.dev)
with torch.inference_mode():
start = time.time()
units = self.hubert_soft.units(source)
use_time = time.time() - start
print("hubert use time:{}".format(use_time))
return units
def get_unit_pitch(self, in_path, tran):
source, sr = torchaudio.load(in_path)
source_bak = copy.deepcopy(source)
source = torchaudio.functional.resample(source, sr, 16000)
if len(source.shape) == 2 and source.shape[1] >= 2:
source = torch.mean(source, dim=0).unsqueeze(0)
soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
return soft, f0, source_bak
def infer(self, speaker_id, tran, raw_path, dev=False):
if type(speaker_id) == str:
speaker_id = self.spk2id[speaker_id]
sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
soft, pitch, source = self.get_unit_pitch(raw_path, tran)
f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev)
if "half" in self.net_g_path and torch.cuda.is_available():
stn_tst = torch.HalfTensor(soft)
else:
stn_tst = torch.FloatTensor(soft)
# extract the magnitude spectrogram
# spec = get_spec(source).to(self.dev)
with torch.no_grad():
x_tst = stn_tst.unsqueeze(0).to(self.dev)
start = time.time()
x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
audio = self.net_g_ms.infer(x_tst, f0=f0, g=sid)[0, 0].data.float()
# audio = self.net_g_ms.infer_v1(x_tst, spec[:, :, :f0.size(-1)], f0=f0, g=sid)[0, 0].data.float()
use_time = time.time() - start
print("vits use time:{}".format(use_time))
return audio, audio.shape[-1]
+class SVCRealTimeByBuffer(object):
+ def __init__(self, net_g_path, config_path, hubert_path="/data/prod/so_vits_models/models/hubert-soft-0d54a1f4.pt"):
+ self.net_g_path = net_g_path
+ self.hubert_path = hubert_path
+ self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ self.net_g_ms = None
+ self.hps_ms = utils.get_hparams_from_file(config_path)
+ self.target_sample = self.hps_ms.data.sampling_rate
+ self.hop_size = self.hps_ms.data.hop_length
+ self.speakers = {}
+ for spk, sid in self.hps_ms.spk.items():
+ self.speakers[sid] = spk
+ self.spk2id = self.hps_ms.spk
+ # load the hubert content encoder
+ self.hubert_soft = hubert_model.hubert_soft(hubert_path)
+ if torch.cuda.is_available():
+ self.hubert_soft = self.hubert_soft.cuda()
+ self.load_model()
+
+ def load_model(self):
+ self.net_g_ms = SynthesizerTrn(
+ self.hps_ms.data.filter_length // 2 + 1,
+ self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
+ **self.hps_ms.model, no_flow=True)
+ # _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
+ net_g = torch.load(self.net_g_path, map_location='cpu')
+ self.net_g_ms.load_state_dict(net_g)
+ if "half" in self.net_g_path and torch.cuda.is_available():
+ _ = self.net_g_ms.half().eval().to(self.dev)
+ else:
+ _ = self.net_g_ms.eval().to(self.dev)
+
+ def get_units(self, source, sr):
+ source = source.unsqueeze(0).to(self.dev)
+ print("source_shape===>", source.shape)
+ with torch.inference_mode():
+ start = time.time()
+ units = self.hubert_soft.units(source)
+ use_time = time.time() - start
+ print("hubert use time:{}".format(use_time))
+ return units
+
+ def get_unit_pitch(self, source, sr, tran):
+ source = torchaudio.functional.resample(source, sr, 16000)
+ if len(source.shape) == 2 and source.shape[1] >= 2:
+ source = torch.mean(source, dim=0).unsqueeze(0)
+ soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
+ f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
+ return soft, f0
+
+ def infer(self, speaker_id, tran, source, sr):
+ if type(speaker_id) == str:
+ speaker_id = self.spk2id[speaker_id]
+ sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
+ soft, pitch = self.get_unit_pitch(source, sr, tran)
+ f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev)
+ if "half" in self.net_g_path and torch.cuda.is_available():
+ stn_tst = torch.HalfTensor(soft)
+ else:
+ stn_tst = torch.FloatTensor(soft)
+
+ with torch.no_grad():
+ x_tst = stn_tst.unsqueeze(0).to(self.dev)
+ start = time.time()
+ x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
+ audio = self.net_g_ms.infer(x_tst, f0=f0, g=sid)[0, 0].data.float()
+ use_time = time.time() - start
+ print("vits use time:{}".format(use_time))
+ return audio, audio.shape[-1]
+
+ def process(self, vocal_path, dst_path, tran=0):
+ source, sr = librosa.load(vocal_path, sr=32000, mono=True)
+ # Process the input in fixed-length chunks and crossfade the overlapping regions
+ out_audio = []
+ source = torch.tensor(source).to(self.dev)
+ hop_len = 3840 * 4 # 4 x 3840 samples = 480 ms of overlap at 32 kHz
+ length = 640 * 1000
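+ # 640,000 samples = 20 s per chunk at 32 kHz; consecutive chunks share hop_len samples that are linearly crossfaded below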
+ for i in range(0, len(source), length - hop_len):
+ cur_hop_len = hop_len
+ input_data = source[i:i + length].unsqueeze(0)
+ audio, _ = self.infer(0, tran, input_data, sr)
+ if len(audio) < hop_len:
+ break
+ if len(out_audio) > 0:
+ # crossfade the head of this chunk with the tail of the previous output
+ for j in range(hop_len):
+ out_audio[i+j] = out_audio[i+j] * (1-(j / hop_len)) + audio[j] * (j / hop_len)
+ else:
+ cur_hop_len = 0
+ out_audio.extend(audio[cur_hop_len:])
+ soundfile.write(dst_path, out_audio, sr, format="wav")
# class SvcONNXInferModel(object):
# def __init__(self, hubert_onnx, vits_onnx, config_path):
# self.config_path = config_path
# self.vits_onnx = vits_onnx
# self.hubert_onnx = hubert_onnx
# self.hubert_onnx_session = onnxruntime.InferenceSession(hubert_onnx, providers=['CUDAExecutionProvider', ])
# self.inspect_onnx(self.hubert_onnx_session)
# self.vits_onnx_session = onnxruntime.InferenceSession(vits_onnx, providers=['CUDAExecutionProvider', ])
# self.inspect_onnx(self.vits_onnx_session)
# self.hps_ms = utils.get_hparams_from_file(self.config_path)
# self.target_sample = self.hps_ms.data.sampling_rate
# self.feature_input = FeatureInput(self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length)
#
# @staticmethod
# def inspect_onnx(session):
# for i in session.get_inputs():
# print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type))
# for i in session.get_outputs():
# print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type))
#
# def infer(self, speaker_id, tran, raw_path):
# sid = np.array([int(speaker_id)], dtype=np.int64)
# soft, pitch = self.get_unit_pitch(raw_path, tran)
# pitch = np.expand_dims(pitch, axis=0).astype(np.int64)
# stn_tst = soft
# x_tst = np.expand_dims(stn_tst, axis=0)
# x_tst_lengths = np.array([stn_tst.shape[0]], dtype=np.int64)
# # run inference with ONNX Runtime
# start = time.time()
# audio = self.vits_onnx_session.run(output_names=["audio"],
# input_feed={
# "hidden_unit": x_tst,
# "lengths": x_tst_lengths,
# "pitch": pitch,
# "sid": sid,
# })[0][0, 0]
# use_time = time.time() - start
# print("vits_onnx_session.run time:{}".format(use_time))
# audio = torch.from_numpy(audio)
# return audio, audio.shape[-1]
#
# def get_units(self, source, sr):
# source = torchaudio.functional.resample(source, sr, 16000)
# if len(source.shape) == 2 and source.shape[1] >= 2:
# source = torch.mean(source, dim=0).unsqueeze(0)
# source = source.unsqueeze(0)
# # run inference with ONNX Runtime
# start = time.time()
# units = self.hubert_onnx_session.run(output_names=["embed"],
# input_feed={"source": source.numpy()})[0]
# use_time = time.time() - start
# print("hubert_onnx_session.run time:{}".format(use_time))
# return units
#
# def transcribe(self, source, sr, length, transform):
# feature_pit = self.feature_input.compute_f0(source, sr)
# feature_pit = feature_pit * 2 ** (transform / 12)
# feature_pit = resize2d_f0(feature_pit, length)
# coarse_pit = self.feature_input.coarse_f0(feature_pit)
# return coarse_pit
#
# def get_unit_pitch(self, in_path, tran):
# source, sr = torchaudio.load(in_path)
# soft = self.get_units(source, sr).squeeze(0)
# input_pitch = self.transcribe(source.numpy()[0], sr, soft.shape[0], tran)
# return soft, input_pitch
class RealTimeVC:
def __init__(self):
self.last_chunk = None
self.last_o = None
self.chunk_len = 16000 # chunk length
self.pre_len = 3840 # crossfade length, a multiple of 640
"""Both input and output are 1-D numpy audio waveform arrays."""
def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path):
audio, sr = torchaudio.load(input_wav_path)
audio = audio.cpu().numpy()[0]
temp_wav = io.BytesIO()
if self.last_chunk is None:
input_wav_path.seek(0)
audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
audio = audio.cpu().numpy()
self.last_chunk = audio[-self.pre_len:]
self.last_o = audio
return audio[-self.chunk_len:]
else:
audio = np.concatenate([self.last_chunk, audio])
soundfile.write(temp_wav, audio, sr, format="wav")
temp_wav.seek(0)
audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav)
audio = audio.cpu().numpy()
ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
self.last_chunk = audio[-self.pre_len:]
self.last_o = audio
return ret[self.chunk_len:2 * self.chunk_len]
diff --git a/AutoCoverTool/ref/so_vits_svc/models.py b/AutoCoverTool/ref/so_vits_svc/models.py
index 477f395..3e9eba9 100644
--- a/AutoCoverTool/ref/so_vits_svc/models.py
+++ b/AutoCoverTool/ref/so_vits_svc/models.py
@@ -1,412 +1,439 @@
import copy
import math
import torch
from torch import nn
from torch.nn import functional as F
import attentions
import commons
import modules
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from commons import init_weights, get_padding
from vdecoder.hifigan.models import Generator
from utils import f0_to_coarse
class ResidualCouplingBlock(nn.Module):
def __init__(self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
n_flows=4,
gin_channels=0):
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.n_flows = n_flows
self.gin_channels = gin_channels
self.flows = nn.ModuleList()
for i in range(n_flows):
self.flows.append(
modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
gin_channels=gin_channels, mean_only=True))
self.flows.append(modules.Flip())
def forward(self, x, x_mask, g=None, reverse=False):
if not reverse:
for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse)
else:
for flow in reversed(self.flows):
x = flow(x, x_mask, g=g, reverse=reverse)
return x
class Encoder(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths, g=None):
# print(x.shape,x_lengths.shape)
# commons.sequence_mask matters at the batch level: x_lengths holds the frame count of each element in the batch.
# For example, input ([3, 5, 2], 5) yields a 3 x 5 True/False matrix: row 1 has 3 True then 2 False, row 2 is all True, row 3 has the first 2 True and the rest False.
# This lets sequences of different lengths be trained in one batch; the shorter ones are multiplied by False to remove the influence of their padding.
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
return z, m, logs, x_mask
class TextEncoder(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0,
filter_channels=None,
n_heads=None,
p_dropout=None):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
self.f0_emb = nn.Embedding(256, hidden_channels)
self.enc_ = attentions.Encoder(
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout)
def forward(self, x, x_lengths, f0=None):
# x->(b,256,frame_num), x_lengths -> (b)
# commons.sequence_mask matters at the batch level: x_lengths holds the frame count of each element in the batch.
# For example, input ([3, 5, 2], 5) yields a 3 x 5 True/False matrix: row 1 has 3 True then 2 False, row 2 is all True, row 3 has the first 2 True and the rest False.
# This lets sequences of different lengths be trained in one batch; the shorter ones are multiplied by False to remove the influence of their padding.
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = x + self.f0_emb(f0).transpose(1, 2)
x = self.enc_(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
# m is the mu obtained from the VAE, and logs is log(sigma)
m, logs = torch.split(stats, self.out_channels, dim=1)
# z is drawn by random sampling (m + noise * exp(logs))
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
return z, m, logs, x_mask
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
self.use_spectral_norm = use_spectral_norm
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
])
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
def forward(self, x):
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, modules.LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
])
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
def forward(self, x):
fmap = []
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, modules.LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(MultiPeriodDiscriminator, self).__init__()
periods = [2, 3, 5, 7, 11]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
y_d_gs.append(y_d_g)
fmap_rs.append(fmap_r)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class SpeakerEncoder(torch.nn.Module):
def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
super(SpeakerEncoder, self).__init__()
self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
self.linear = nn.Linear(model_hidden_size, model_embedding_size)
self.relu = nn.ReLU()
def forward(self, mels):
self.lstm.flatten_parameters()
_, (hidden, _) = self.lstm(mels)
embeds_raw = self.relu(self.linear(hidden[-1]))
return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
mel_slices = []
for i in range(0, total_frames - partial_frames, partial_hop):
mel_range = torch.arange(i, i + partial_frames)
mel_slices.append(mel_range)
return mel_slices
def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
mel_len = mel.size(1)
last_mel = mel[:, -partial_frames:]
if mel_len > partial_frames:
mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
mels = list(mel[:, s] for s in mel_slices)
mels.append(last_mel)
mels = torch.stack(tuple(mels), 0).squeeze(1)
with torch.no_grad():
partial_embeds = self(mels)
embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
# embed = embed / torch.linalg.norm(embed, 2)
else:
with torch.no_grad():
embed = self(last_mel)
return embed
class SynthesizerTrn(nn.Module):
"""
Synthesizer for Training
"""
def __init__(self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels,
ssl_dim,
n_speakers,
+ no_flow=False,
+ use_v3=False,
**kwargs):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.gin_channels = gin_channels
self.ssl_dim = ssl_dim
self.emb_g = nn.Embedding(n_speakers, gin_channels)
self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16, 0, filter_channels, n_heads,
p_dropout)
+ self.no_flow = no_flow
+ self.use_v3 = use_v3
hps = {
"sampling_rate": 32000,
"inter_channels": 192,
"resblock": "1",
"resblock_kernel_sizes": [3, 7, 11],
"resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
"upsample_rates": [10, 8, 2, 2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16, 16, 4, 4],
"gin_channels": 256,
}
+ if self.use_v3:
+ # v3 decoder configuration
+ hps = {
+ "sampling_rate": 32000,
+ "inter_channels": 192,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3, 5, 7],
+ "resblock_dilation_sizes": [[1, 2], [2, 6], [3, 12]],
+ "upsample_rates": [10, 8, 4],
+ "upsample_initial_channel": 256,
+ "upsample_kernel_sizes": [16, 16, 8],
+ "gin_channels": 256,
+ }
self.dec = Generator(h=hps)
self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+
+ print("no_flow={}, use_v3={}".format(self.no_flow, self.use_v3))
+
+ if not self.no_flow:
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
def forward(self, c, f0, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
# hubert features (b, 256, frame_num), f0 (frame_num), magnitude spectrogram, speaker id, mel spectrogram
if c_lengths == None:
c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) # [frame_num,....]
if spec_lengths == None:
spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
# speaker embedding
g = self.emb_g(g).transpose(1, 2)
# z sampled from the distribution, the mean the VAE needs, and logs_p = log(sigma)
# Input: hubert features (b, 256, frame_num) and f0 (frame_num); this is the content-to-latent part of the model
z_ptemp, m_p, logs_p, _ = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0))
# Input: magnitude spectrogram and speaker information
# Output: sampled z, with mean m_q and log std logs_q
z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
# normalizing flow, used to make the distribution more expressive
- z_p = self.flow(z, spec_mask, g=g)
+ if not self.no_flow:
+ z_p = self.flow(z, spec_mask, g=g)
# Since the elements of a batch contain different numbers of audio frames, each element is randomly cropped to segment_size frames
# Returns the cropped z batch, the pitch_slice list, and the ids_slice list
z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)
# o = self.dec(z_slice, g=g)
# Decoder: takes z (not passed through the normalizing flow), the speaker embedding and the pitch, and produces the waveform
o = self.dec(z_slice, g=g, f0=pitch_slice)
+ if self.no_flow:
+ z_p = z
# Returns: the generated waveform, the position of each sampled frame in the batch, the valid spectrogram frame positions in the batch,
# z sampled from the Gaussian produced by the spectrogram encoder, z_p = z after the normalizing flow, the mean of the Gaussian from the hubert branch,
# the log std from the hubert branch (logs_p), and the mean (m_q) and log std (logs_q) from the spectrogram + speaker branch
return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, c, f0, g=None, mel=None, c_lengths=None):
+ print(c.shape, f0.shape, g.shape)
# hubert features (b, 256, frame_num), f0 (frame_num), speaker id, mel spectrogram
if c_lengths == None:
c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) # [frame_num,....]
# speaker information passed through the embedding
g = self.emb_g(g).transpose(1, 2)
# z sampled from the distribution, the mean the VAE needs, and logs_p = log(sigma)
# Input: hubert features (b, 256, frame_num) and f0 (frame_num)
# c_mask exists because frame_num can differ across batch elements; it ensures each element contributes exactly its own valid frames
z_p, m_p, logs_p, c_mask = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0))
# push the sampled z and the speaker embedding through the normalizing flow (in the reverse direction)
- z = self.flow(z_p, c_mask, g=g, reverse=True)
+ if not self.no_flow:
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
+ z_p = z
# decode to obtain the waveform
- o = self.dec(z * c_mask, g=g, f0=f0)
+ o = self.dec(z_p * c_mask, g=g, f0=f0)
return o
def infer_v1(self, c, spec, f0, g):
print(c.shape, spec.shape, f0.shape, g.shape)
c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) # (b, frame_num)
spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
g = self.emb_g(g).transpose(1, 2)
# z_p, m_p, logs_p, c_mask = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0))
# z = self.flow(z_p, c_mask, g=g, reverse=True)
# o = self.dec(z * c_mask, g=g, f0=f0)
# print(c_mask.shape, c_mask)
z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
o = self.dec(z, g=g, f0=f0)
return o
if __name__ == '__main__':
# m = MultiPeriodDiscriminator()
# y = torch.rand((1, 1, 96000))
# y_hat = torch.rand((1, 1, 96000))
# a, b, c, d = m(y, y_hat)
t1 = DiscriminatorS()
y = torch.rand(1, 1, 512)
a, b = t1(y)
print(a.shape)
diff --git a/AutoCoverTool/ref/so_vits_svc/preprocess_flist_config.py b/AutoCoverTool/ref/so_vits_svc/preprocess_flist_config.py
index 5b7e80a..59f7bf8 100644
--- a/AutoCoverTool/ref/so_vits_svc/preprocess_flist_config.py
+++ b/AutoCoverTool/ref/so_vits_svc/preprocess_flist_config.py
@@ -1,132 +1,137 @@
import os
import argparse
import re
from tqdm import tqdm
from random import shuffle
import json
config_template = {
"train": {
"log_interval": 200,
"eval_interval": 1000,
"seed": 1234,
- "epochs": 400, # 由10000->400
+ "epochs": 10000, # 由10000->400
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 12,
"fp16_run": False,
"lr_decay": 0.999875,
"segment_size": 17920,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0,
"use_sr": True,
"max_speclen": 384,
"port": "8001"
},
"data": {
"training_files": "filelists/train.txt",
"validation_files": "filelists/val.txt",
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1280,
"hop_length": 320,
"win_length": 1280,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": None
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [3, 7, 11],
"resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
"upsample_rates": [10, 8, 2, 2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16, 16, 4, 4],
"n_layers_q": 3,
"use_spectral_norm": False,
"gin_channels": 256,
"ssl_dim": 256,
"n_speakers": 0,
},
"spk": {
"nen": 0,
"paimon": 1,
"yunhao": 2
}
}
pattern = re.compile(r'^[\.a-zA-Z0-9_\/]+$')
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list")
parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list")
parser.add_argument("--test_list", type=str, default="./filelists/test.txt", help="path to test list")
parser.add_argument("--source_dir", type=str, default="./dataset/32k", help="path to source dir")
parser.add_argument("--config_path", type=str, default="./config/config.json", help="path to source dir")
args = parser.parse_args()
train = []
val = []
test = []
idx = 0
spk_dict = {}
spk_id = 0
for speaker in tqdm(os.listdir(args.source_dir)):
spk_dict[speaker] = spk_id
spk_id += 1
wavs = ["/".join([args.source_dir, speaker, i]) for i in os.listdir(os.path.join(args.source_dir, speaker))]
for wavpath in wavs:
if not pattern.match(wavpath):
print(f"warning:文件名{wavpath}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
if len(wavs) < 10:
print(f"warning:{speaker}数据集数量小于10条,请补充数据")
wavs = [i for i in wavs if i.endswith("wav")]
shuffle(wavs)
- train += wavs[2:-2]
- val += wavs[:2]
- test += wavs[-2:]
+ if len(wavs) < 10:
+ train += wavs
+ val += wavs
+ test += wavs
+ else:
+ train += wavs[2:-2]
+ val += wavs[:2]
+ test += wavs[-2:]
n_speakers = len(spk_dict.keys()) * 2
shuffle(train)
shuffle(val)
shuffle(test)
print("Writing", args.train_list)
with open(args.train_list, "w") as f:
for fname in tqdm(train):
wavpath = fname
f.write(wavpath + "\n")
print("Writing", args.val_list)
with open(args.val_list, "w") as f:
for fname in tqdm(val):
wavpath = fname
f.write(wavpath + "\n")
print("Writing", args.test_list)
with open(args.test_list, "w") as f:
for fname in tqdm(test):
wavpath = fname
f.write(wavpath + "\n")
config_template["model"]["n_speakers"] = n_speakers
config_template["spk"] = spk_dict
print("Writing configs/config.json")
# update the config file
config_template["data"]["training_files"] = args.train_list
config_template["data"]["validation_files"] = args.val_list
with open(args.config_path, "w") as f:
json.dump(config_template, f, indent=2)
diff --git a/AutoCoverTool/ref/so_vits_svc/real_time_inference.py b/AutoCoverTool/ref/so_vits_svc/real_time_inference.py
new file mode 100644
index 0000000..e0f5af0
--- /dev/null
+++ b/AutoCoverTool/ref/so_vits_svc/real_time_inference.py
@@ -0,0 +1,23 @@
+"""
+实时推理
+"""
+import os
+import time
+from ref.so_vits_svc.inference.infer_tool import SVCRealTimeByBuffer, Svc
+
+
+def test(in_path, out_path):
+ config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json")
+ print(config_path)
+ net_g_path = "/Users/yangjianli/starmaker/av_tools/resource_av_svc/models/G_inference_1000.pth"
+ hubert_net_path = "/Users/yangjianli/starmaker/av_tools/resource_av_svc/models/hubert-soft-0d54a1f4.pt"
+ stbb = SVCRealTimeByBuffer(net_g_path, config_path, hubert_net_path)
+ stbb.process(in_path, out_path)
+
+
+if __name__ == '__main__':
+ in_path = "/Users/yangjianli/starmaker/av_tools/resource_av_svc/clean_jianli/vocal_32.wav"
+ out_path = "/Users/yangjianli/starmaker/av_tools/resource_av_svc/clean_jianli/vocal_out.wav"
+ st = time.time()
+ test(in_path, out_path)
+ print("sp={}".format(time.time() - st))
diff --git a/AutoCoverTool/ref/so_vits_svc/train.py b/AutoCoverTool/ref/so_vits_svc/train.py
index 69f56ac..cb656f3 100644
--- a/AutoCoverTool/ref/so_vits_svc/train.py
+++ b/AutoCoverTool/ref/so_vits_svc/train.py
@@ -1,312 +1,334 @@
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('numba').setLevel(logging.WARNING)
import os
import json
import argparse
import itertools
import math
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler
import commons
import utils
from data_utils import TextAudioSpeakerLoader, EvalDataLoader
from models import (
SynthesizerTrn,
MultiPeriodDiscriminator,
)
from losses import (
kl_loss,
generator_loss, discriminator_loss, feature_loss
)
from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
torch.backends.cudnn.benchmark = True
global_step = 0
# os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
def main():
"""Assume Single Node Multi GPUs Training Only"""
assert torch.cuda.is_available(), "CPU training is not allowed."
hps = utils.get_hparams()
n_gpus = torch.cuda.device_count()
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = hps.train.port
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
def run(rank, n_gpus, hps):
print("CurRank:===>", rank)
global global_step
if rank == 0:
logger = utils.get_logger(hps.model_dir)
logger.info(hps)
utils.check_git_hash(hps.model_dir)
writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
torch.manual_seed(hps.train.seed)
torch.cuda.set_device(rank)
# Extract features from each audio file:
# hubert features, f0, magnitude spectrogram, the corresponding waveform segment (384 * hop_length), speaker code [0]; each item covers 3840 ms of audio
- train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps)
- train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
+ # train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps)
+ from script.train_user_by_one_media import TextAudioSpeakerLoader
+ train_dataset = TextAudioSpeakerLoader('data/train_users/qiankun_v1/vocals/speaker0/qiankun.wav')
+ train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, pin_memory=True,
batch_size=hps.train.batch_size)
if rank == 0:
eval_dataset = EvalDataLoader(hps.data.validation_files, hps)
- eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
+ eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False,
batch_size=1, pin_memory=False,
drop_last=False)
net_g = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
- **hps.model).cuda(rank)
+ **hps.model, no_flow=False, use_v3=False).cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
optim_g = torch.optim.AdamW(
net_g.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps)
optim_d = torch.optim.AdamW(
net_d.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps)
- net_g = DDP(net_g, device_ids=[rank]) # , find_unused_parameters=True)
- net_d = DDP(net_d, device_ids=[rank])
+ # net_g = DDP(net_g, device_ids=[rank]) # , find_unused_parameters=True)
+ # net_d = DDP(net_d, device_ids=[rank])
+ print("{}".format(hps.model_dir))
try:
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g,
optim_g)
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d,
optim_d)
global_step = (epoch_str - 1) * len(train_loader)
- print("load checkpoint ok !")
- except:
+ print("load checkpoint ok ! {}".format(epoch_str))
+ except Exception as ex:
+ global_step = 1
epoch_str = 1
- global_step = 0
+ print("EXXX ! {}".format(ex))
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
scaler = GradScaler(enabled=hps.train.fp16_run)
for epoch in range(epoch_str, hps.train.epochs + 1):
if rank == 0:
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
[train_loader, eval_loader], logger, [writer, writer_eval])
else:
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
[train_loader, None], None, None)
scheduler_g.step()
scheduler_d.step()
def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
net_g, net_d = nets
optim_g, optim_d = optims
scheduler_g, scheduler_d = schedulers
train_loader, eval_loader = loaders
if writers is not None:
writer, writer_eval = writers
# train_loader.batch_sampler.set_epoch(epoch)
global global_step
net_g.train()
net_d.train()
for batch_idx, items in enumerate(train_loader):
# hubert features, f0, magnitude spectrogram, corresponding waveform segment (384 * hop_length), speaker code [0]
c, f0, spec, y, spk = items
g = spk.cuda(rank, non_blocking=True)
spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
c = c.cuda(rank, non_blocking=True)
f0 = f0.cuda(rank, non_blocking=True)
"""
"sampling_rate": 32000,
"filter_length": 1280,
"hop_length": 320,
"win_length": 1280,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
"""
mel = spec_to_mel_torch(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax)
with autocast(enabled=hps.train.fp16_run):
# net_g input: hubert features, f0, magnitude spectrogram, speaker id, mel spectrogram
# net_g output:
# the generated waveform, the position of each sampled frame in the batch, the valid spectrogram frame positions in the batch,
# z sampled from the Gaussian produced by the spectrogram encoder, z_p = z after the normalizing flow, the mean of the Gaussian from the hubert branch,
# the log std from the hubert branch (logs_p), and the mean (m_q) and log std (logs_q) from the spectrogram + speaker branch
y_hat, ids_slice, z_mask, \
(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel)
y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
y_hat_mel = mel_spectrogram_torch(
y_hat.squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax
)
y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
loss_disc_all = loss_disc
optim_d.zero_grad()
scaler.scale(loss_disc_all).backward()
scaler.unscale_(optim_d)
- grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
+ # grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
with autocast(enabled=False):
# L1 loss between mel spectrograms, scaled by the coefficient c_mel; smaller is better
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
# KL divergence. z_p: spectrogram-side sample after the normalizing flow, logs_q: spectrogram-side log std, m_p: hubert-side mean,
# logs_p: hubert-side log std, z_mask: valid spectrogram frame positions in the batch
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
# Feature-matching loss: take the per-layer feature maps of y and y_hat from the discriminator and compute their L1 distance
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
- grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
+ # grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]['lr']
losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl]
logger.info('Train Epoch: {} [{:.0f}%]'.format(
epoch,
100. * batch_idx / len(train_loader)))
logger.info([x.item() for x in losses] + [global_step, lr])
- scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr,
- "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
- scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl})
-
- scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
- scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
- scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
- image_dict = {
- "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
- "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
- "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
- }
-
- utils.summarize(
- writer=writer,
- global_step=global_step,
- images=image_dict,
- scalars=scalar_dict
- )
-
- if global_step % hps.train.eval_interval == 0:
+ losses_numpy = [round(loss_disc.item(), 3), round(loss_gen.item(), 3),
+ round(loss_fm.item(), 3), round(loss_mel.item(), 3), round(loss_kl.item(), 3)]
+ print("gstep={},lr={},disc={},gen={},fm={},mel={},kl={},tot={}".format(global_step, lr,
+ losses_numpy[0],
+ losses_numpy[1],
+ losses_numpy[2],
+ losses_numpy[3],
+ losses_numpy[4],
+ sum(losses_numpy)))
+
+ # scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr,
+ # "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
+ # scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl})
+ #
+ # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
+ # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
+ # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
+ # image_dict = {
+ # "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
+ # "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
+ # "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
+ # }
+
+ # utils.summarize(
+ # writer=writer,
+ # global_step=global_step,
+ # images=image_dict,
+ # scalars=scalar_dict
+ # )
+
+ if global_step % hps.train.eval_interval == 0 and global_step != 0:
evaluate(hps, net_g, eval_loader, writer_eval)
utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
# stop once the step limit is reached
- if global_step == 2000:
- logger.info('====> 2000 ==> Epoch: {},{}'.format(epoch, global_step))
+ if global_step == 1000:
+ logger.info('====> 1000 ==> Epoch: {},{}'.format(epoch, global_step))
exit(0)
global_step += 1
if rank == 0:
logger.info('====> Epoch: {},{}'.format(epoch, global_step))
def evaluate(hps, generator, eval_loader, writer_eval):
generator.eval()
image_dict = {}
audio_dict = {}
with torch.no_grad():
for batch_idx, items in enumerate(eval_loader):
c, f0, spec, y, spk = items
g = spk[:1].cuda(0)
spec, y = spec[:1].cuda(0), y[:1].cuda(0)
c = c[:1].cuda(0)
f0 = f0[:1].cuda(0)
mel = spec_to_mel_torch(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax)
- y_hat = generator.module.infer(c, f0, g=g, mel=mel)
+
+ if hasattr(generator, 'module'):
+ y_hat = generator.module.infer(c, f0, g=g, mel=mel)
+ else:
+ y_hat = generator.infer(c, f0, g=g, mel=mel)
y_hat_mel = mel_spectrogram_torch(
y_hat.squeeze(1).float(),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax
)
audio_dict.update({
f"gen/audio_{batch_idx}": y_hat[0],
f"gt/audio_{batch_idx}": y[0]
})
image_dict.update({
f"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
})
utils.summarize(
writer=writer_eval,
global_step=global_step,
images=image_dict,
audios=audio_dict,
audio_sampling_rate=hps.data.sampling_rate
)
generator.train()
if __name__ == "__main__":
+ import time
+
+ st = time.time()
main()
+ print("sp={}".format(time.time() - st))
diff --git a/AutoCoverTool/ref/so_vits_svc/utils.py b/AutoCoverTool/ref/so_vits_svc/utils.py
index 2dadf1a..6bba348 100644
--- a/AutoCoverTool/ref/so_vits_svc/utils.py
+++ b/AutoCoverTool/ref/so_vits_svc/utils.py
@@ -1,366 +1,366 @@
import os
import glob
import re
import sys
import argparse
import logging
import json
import subprocess
import librosa
import numpy as np
import torchaudio
from scipy.io.wavfile import read
import torch
import torchvision
from torch.nn import functional as F
from commons import sequence_mask
from hubert import hubert_model
MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging
f0_bin = 256
f0_max = 1100.0
f0_min = 50.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
def f0_to_coarse(f0):
"""
将f0按照Log10的级别进行区分,最后归一化到[1-255] 之间
:param f0:
:return:
"""
is_torch = isinstance(f0, torch.Tensor)
f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
# np.rint() rounds to the nearest integer
f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int)
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
return f0_coarse
def get_hubert_model(rank=None):
hubert_soft = hubert_model.hubert_soft("/data/prod/so_vits_models/models/hubert-soft-0d54a1f4.pt")
if rank is not None:
hubert_soft = hubert_soft.cuda(rank)
return hubert_soft
def get_hubert_content(hmodel, y=None, path=None):
if path is not None:
source, sr = torchaudio.load(path)
source = torchaudio.functional.resample(source, sr, 16000)
if len(source.shape) == 2 and source.shape[1] >= 2:
source = torch.mean(source, dim=0).unsqueeze(0)
else:
source = y
source = source.unsqueeze(0)
with torch.inference_mode():
units = hmodel.units(source)
return units.transpose(1, 2)
def get_content(cmodel, y):
with torch.no_grad():
c = cmodel.extract_features(y.squeeze(1))[0]
c = c.transpose(1, 2)
return c
def transform(mel, height): # 68-92
# r = np.random.random()
# rate = r * 0.3 + 0.85 # 0.85-1.15
# height = int(mel.size(-2) * rate)
tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
if height >= mel.size(-2):
return tgt[:, :mel.size(-2), :]
else:
silence = tgt[:, -1:, :].repeat(1, mel.size(-2) - height, 1)
silence += torch.randn_like(silence) / 10
return torch.cat((tgt, silence), 1)
def stretch(mel, width): # 0.5-2
return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))
def load_checkpoint(checkpoint_path, model, optimizer=None):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
- iteration = checkpoint_dict['iteration']
- learning_rate = checkpoint_dict['learning_rate']
+ iteration = checkpoint_dict.get('iteration', None)
+ learning_rate = checkpoint_dict.get('learning_rate', None)
if iteration is None:
iteration = 1
if learning_rate is None:
learning_rate = 0.0002
- if optimizer is not None and checkpoint_dict['optimizer'] is not None:
+ if optimizer is not None and checkpoint_dict.get('optimizer', None) is not None:
optimizer.load_state_dict(checkpoint_dict['optimizer'])
saved_state_dict = checkpoint_dict['model']
if hasattr(model, 'module'):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
try:
new_state_dict[k] = saved_state_dict[k]
except:
logger.info("%s is not in the checkpoint" % k)
new_state_dict[k] = v
if hasattr(model, 'module'):
model.module.load_state_dict(new_state_dict)
else:
model.load_state_dict(new_state_dict)
logger.info("Loaded checkpoint '{}' (iteration {})".format(
checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
logger.info("Saving model and optimizer state at iteration {} to {}".format(
iteration, checkpoint_path))
if hasattr(model, 'module'):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save({'model': state_dict,
'iteration': iteration,
'optimizer': optimizer.state_dict(),
'learning_rate': learning_rate}, checkpoint_path)
clean_ckpt = False
if clean_ckpt:
clean_checkpoints(path_to_models='logs/32k/', n_ckpts_to_keep=3, sort_by_time=True)
def clean_checkpoints(path_to_models='logs/48k/', n_ckpts_to_keep=2, sort_by_time=True):
"""Freeing up space by deleting saved ckpts
Arguments:
path_to_models -- Path to the model directory
n_ckpts_to_keep -- Number of ckpts to keep, excluding G_0.pth and D_0.pth
sort_by_time -- True -> chronologically delete ckpts
False -> lexicographically delete ckpts
"""
ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))]
name_key = (lambda _f: int(re.compile('._(\d+)\.pth').match(_f).group(1)))
time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)))
sort_key = time_key if sort_by_time else name_key
x_sorted = lambda _x: sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith('_0.pth')],
key=sort_key)
to_del = [os.path.join(path_to_models, fn) for fn in
(x_sorted('G')[:-n_ckpts_to_keep] + x_sorted('D')[:-n_ckpts_to_keep])]
del_info = lambda fn: logger.info(f".. Free up space by deleting ckpt {fn}")
del_routine = lambda x: [os.remove(x), del_info(x)]
rs = [del_routine(fn) for fn in to_del]
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
for k, v in scalars.items():
writer.add_scalar(k, v, global_step)
for k, v in histograms.items():
writer.add_histogram(k, v, global_step)
for k, v in images.items():
writer.add_image(k, v, global_step, dataformats='HWC')
for k, v in audios.items():
writer.add_audio(k, v, global_step, audio_sampling_rate)
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
f_list = glob.glob(os.path.join(dir_path, regex))
f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
x = f_list[-1]
print(x)
return x
def plot_spectrogram_to_numpy(spectrogram):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
interpolation='none')
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def plot_alignment_to_numpy(alignment, info=None):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if info is not None:
xlabel += '\n\n' + info
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
plt.tight_layout()
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
def load_filepaths_and_text(filename, split="|"):
with open(filename, encoding='utf-8') as f:
filepaths_and_text = [line.strip().split(split) for line in f]
return filepaths_and_text
def get_hparams(init=True):
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
help='JSON file for configuration')
parser.add_argument('-m', '--model', type=str, required=True,
help='Model name')
parser.add_argument('-l', '--logs', type=str, required=True,
help='log Name')
args = parser.parse_args()
model_dir = os.path.join(args.logs, args.model)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
config_path = args.config
config_save_path = os.path.join(model_dir, "config.json")
if init:
with open(config_path, "r") as f:
data = f.read()
with open(config_save_path, "w") as f:
f.write(data)
else:
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_dir(model_dir):
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_file(config_path):
with open(config_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
return hparams
def check_git_hash(model_dir):
source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
source_dir
))
return
cur_hash = subprocess.getoutput("git rev-parse HEAD")
path = os.path.join(model_dir, "githash")
if os.path.exists(path):
saved_hash = open(path).read()
if saved_hash != cur_hash:
logger.warn("git hash values are different. {}(saved) != {}(current)".format(
saved_hash[:8], cur_hash[:8]))
else:
open(path, "w").write(cur_hash)
def get_logger(model_dir, filename="train.log"):
global logger
logger = logging.getLogger(os.path.basename(model_dir))
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
h = logging.FileHandler(os.path.join(model_dir, filename))
h.setLevel(logging.DEBUG)
h.setFormatter(formatter)
logger.addHandler(h)
return logger
class HParams():
def __init__(self, **kwargs):
for k, v in kwargs.items():
if type(v) == dict:
v = HParams(**v)
self[k] = v
def keys(self):
return self.__dict__.keys()
def items(self):
return self.__dict__.items()
def values(self):
return self.__dict__.values()
def __len__(self):
return len(self.__dict__)
def __getitem__(self, key):
return getattr(self, key)
def __setitem__(self, key, value):
return setattr(self, key, value)
def __contains__(self, key):
return key in self.__dict__
def __repr__(self):
return self.__dict__.__repr__()
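# Minimal usage sketch (illustrative): HParams wraps a nested config dict so values can be
# read either as attributes or as dict-style items.
#     hps = HParams(**{"train": {"batch_size": 12}, "data": {"sampling_rate": 32000}})
#     hps.train.batch_size        # 12
#     hps["data"].sampling_rate   # 32000
#     "train" in hps              # True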
diff --git a/AutoCoverTool/ref/so_vits_svc/vdecoder/hifigan/models.py b/AutoCoverTool/ref/so_vits_svc/vdecoder/hifigan/models.py
index 9747301..629812e 100644
--- a/AutoCoverTool/ref/so_vits_svc/vdecoder/hifigan/models.py
+++ b/AutoCoverTool/ref/so_vits_svc/vdecoder/hifigan/models.py
@@ -1,503 +1,511 @@
import os
import json
from .env import AttrDict
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from .utils import init_weights, get_padding
LRELU_SLOPE = 0.1
def load_model(model_path, device='cuda'):
config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
with open(config_file) as f:
data = f.read()
global h
json_config = json.loads(data)
h = AttrDict(json_config)
generator = Generator(h).to(device)
cp_dict = torch.load(model_path)
generator.load_state_dict(cp_dict['generator'])
generator.eval()
generator.remove_weight_norm()
del cp_dict
return generator, h
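# Usage sketch (illustrative): load_model expects a config.json next to the checkpoint and
# returns the generator ready for inference (eval mode, weight norm removed). The path below
# is hypothetical.
#     generator, h = load_model("/path/to/hifigan/g_latest.pth", device="cpu")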
class ResBlock1(torch.nn.Module):
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
super(ResBlock1, self).__init__()
self.h = h
- self.convs1 = nn.ModuleList([
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
- padding=get_padding(kernel_size, dilation[0]))),
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
- padding=get_padding(kernel_size, dilation[1]))),
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
- padding=get_padding(kernel_size, dilation[2])))
- ])
+ weight_norm_arr = []
+ for i in range(len(dilation)):
+ weight_norm_arr.append(weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[i],
+ padding=get_padding(kernel_size, dilation[i]))))
+ self.convs1 = nn.ModuleList(weight_norm_arr)
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1)))
])
self.convs2.apply(init_weights)
def forward(self, x):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE)
xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE)
xt = c2(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
class ResBlock2(torch.nn.Module):
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
super(ResBlock2, self).__init__()
self.h = h
self.convs = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1])))
])
self.convs.apply(init_weights)
def forward(self, x):
for c in self.convs:
xt = F.leaky_relu(x, LRELU_SLOPE)
xt = c(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs:
remove_weight_norm(l)
def padDiff(x):
- return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
+ return F.pad(F.pad(x, (0, 0, -1, 1), 'constant', 0) - x, (0, 0, 0, -1), 'constant', 0)
+
class SineGen(torch.nn.Module):
""" Definition of sine generator
SineGen(samp_rate, harmonic_num = 0,
sine_amp = 0.1, noise_std = 0.003,
voiced_threshold = 0,
flag_for_pulse=False)
samp_rate: sampling rate in Hz
harmonic_num: number of harmonic overtones (default 0)
sine_amp: amplitude of sine waveform (default 0.1)
noise_std: std of Gaussian noise (default 0.003)
voiced_threshold: F0 threshold for U/V classification (default 0)
flag_for_pulse: whether this SineGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced
segment is always sin(np.pi) or cos(0)
"""
def __init__(self, samp_rate, harmonic_num=0,
sine_amp=0.1, noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False):
super(SineGen, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
self.harmonic_num = harmonic_num
self.dim = self.harmonic_num + 1
self.sampling_rate = samp_rate
self.voiced_threshold = voiced_threshold
self.flag_for_pulse = flag_for_pulse
def _f02uv(self, f0):
# generate uv signal
uv = (f0 > self.voiced_threshold).type(torch.float32)
return uv
def _f02sine(self, f0_values):
""" f0_values: (batchsize, length, dim)
where dim indicates fundamental tone and overtones
"""
# convert to F0 in rad. The integer part n can be ignored
# because 2 * np.pi * n doesn't affect phase
rad_values = (f0_values / self.sampling_rate) % 1
# initial phase noise (no noise for fundamental component)
rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
device=f0_values.device)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
# instantaneous phase sine[t] = sin(2*pi \sum_{i=1}^{t} rad)
if not self.flag_for_pulse:
# for normal case
# To prevent torch.cumsum numerical overflow,
# it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
# Buffer tmp_over_one_idx indicates the time step to add -1.
# This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
tmp_over_one = torch.cumsum(rad_values, 1) % 1
tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
* 2 * np.pi)
else:
# If necessary, make sure that the first time step of every
# voiced segments is sin(pi) or cos(0)
# This is used for pulse-train generation
# identify the last time step in unvoiced segments
uv = self._f02uv(f0_values)
uv_1 = torch.roll(uv, shifts=-1, dims=1)
uv_1[:, -1, :] = 1
u_loc = (uv < 1) * (uv_1 > 0)
# get the instantaneous phase
tmp_cumsum = torch.cumsum(rad_values, dim=1)
# different batch needs to be processed differently
for idx in range(f0_values.shape[0]):
temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
# stores the accumulation of i.phase within
# each voiced segments
tmp_cumsum[idx, :, :] = 0
tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
# rad_values - tmp_cumsum: remove the accumulation of i.phase
# within the previous voiced segment.
i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
# get the sines
sines = torch.cos(i_phase * 2 * np.pi)
return sines
def forward(self, f0):
""" sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
output uv: tensor(batchsize=1, length, 1)
"""
with torch.no_grad():
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
device=f0.device)
# fundamental component
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
# generate sine waveforms
sine_waves = self._f02sine(fn) * self.sine_amp
# generate uv signal
# uv = torch.ones(f0.shape)
# uv = uv * (f0 > self.voiced_threshold)
uv = self._f02uv(f0)
# noise: for unvoiced should be similar to sine_amp
# std = self.sine_amp/3 -> max value ~ self.sine_amp
# . for voiced regions is self.noise_std
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
# first: set the unvoiced part to 0 by uv
# then: additive noise
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
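# Shape sketch (illustrative, matching the docstring above): with harmonic_num=8 the module
# produces the fundamental plus 8 overtones.
#     sine_gen = SineGen(samp_rate=32000, harmonic_num=8)
#     f0 = torch.full((1, 100, 1), 220.0)      # (batch, frames, 1), a constant 220 Hz
#     sine_waves, uv, noise = sine_gen(f0)
#     # sine_waves: (1, 100, 9), uv: (1, 100, 1), noise: (1, 100, 9)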
class SourceModuleHnNSF(torch.nn.Module):
""" SourceModule for hn-nsf
SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0)
sampling_rate: sampling_rate in Hz
harmonic_num: number of harmonic above F0 (default: 0)
sine_amp: amplitude of sine source signal (default: 0.1)
add_noise_std: std of additive Gaussian noise (default: 0.003)
note that amplitude of noise in unvoiced is decided
by sine_amp
voiced_threshold: threshold to set U/V given F0 (default: 0)
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
F0_sampled (batchsize, length, 1)
Sine_source (batchsize, length, 1)
noise_source (batchsize, length 1)
uv (batchsize, length, 1)
"""
def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0):
super(SourceModuleHnNSF, self).__init__()
self.sine_amp = sine_amp
self.noise_std = add_noise_std
# to produce sine waveforms
self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
sine_amp, add_noise_std, voiced_threshod)
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
def forward(self, x):
"""
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
F0_sampled (batchsize, length, 1)
Sine_source (batchsize, length, 1)
noise_source (batchsize, length 1)
"""
# source for harmonic branch
sine_wavs, uv, _ = self.l_sin_gen(x)
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
# source for noise branch, in the same shape as uv
noise = torch.randn_like(uv) * self.sine_amp / 3
return sine_merge, noise, uv
class Generator(torch.nn.Module):
def __init__(self, h):
super(Generator, self).__init__()
self.h = h
self.num_kernels = len(h["resblock_kernel_sizes"])
self.num_upsamples = len(h["upsample_rates"])
self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"]))
self.m_source = SourceModuleHnNSF(
sampling_rate=h["sampling_rate"],
harmonic_num=8)
self.noise_convs = nn.ModuleList()
self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3))
resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])):
c_cur = h["upsample_initial_channel"] // (2 ** (i + 1))
self.ups.append(weight_norm(
- ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)),
+ ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i),
+ h["upsample_initial_channel"] // (2 ** (i + 1)),
k, u, padding=(k - u) // 2)))
if i + 1 < len(h["upsample_rates"]): #
stride_f0 = np.prod(h["upsample_rates"][i + 1:])
self.noise_convs.append(Conv1d(
1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
else:
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = h["upsample_initial_channel"] // (2 ** (i + 1))
for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])):
self.resblocks.append(resblock(h, ch, k, d))
self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
self.ups.apply(init_weights)
self.conv_post.apply(init_weights)
self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
def forward(self, x, f0, g=None):
# print(1,x.shape,f0.shape,f0[:, None].shape)
f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
# print(2,f0.shape)
har_source, noi_source, uv = self.m_source(f0)
har_source = har_source.transpose(1, 2)
x = self.conv_pre(x)
x = x + self.cond(g)
# print(124,x.shape,har_source.shape)
for i in range(self.num_upsamples):
x = F.leaky_relu(x, LRELU_SLOPE)
# print(3,x.shape)
x = self.ups[i](x)
x_source = self.noise_convs[i](har_source)
# print(4,x_source.shape,har_source.shape,x.shape)
x = x + x_source
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
print('Removing weight norm...')
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
remove_weight_norm(self.conv_pre)
remove_weight_norm(self.conv_post)
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
])
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
def forward(self, x):
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, periods=None):
super(MultiPeriodDiscriminator, self).__init__()
self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
self.discriminators = nn.ModuleList()
for period in self.periods:
self.discriminators.append(DiscriminatorP(period))
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
])
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
def forward(self, x):
fmap = []
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiScaleDiscriminator(torch.nn.Module):
def __init__(self):
super(MultiScaleDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
DiscriminatorS(use_spectral_norm=True),
DiscriminatorS(),
DiscriminatorS(),
])
self.meanpools = nn.ModuleList([
AvgPool1d(4, 2, padding=2),
AvgPool1d(4, 2, padding=2)
])
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
if i != 0:
y = self.meanpools[i - 1](y)
y_hat = self.meanpools[i - 1](y_hat)
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
loss += torch.mean(torch.abs(rl - gl))
return loss * 2
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
r_loss = torch.mean((1 - dr) ** 2)
g_loss = torch.mean(dg ** 2)
loss += (r_loss + g_loss)
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())
return loss, r_losses, g_losses
def generator_loss(disc_outputs):
loss = 0
gen_losses = []
for dg in disc_outputs:
l = torch.mean((1 - dg) ** 2)
gen_losses.append(l)
loss += l
return loss, gen_losses
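# Worked toy example (illustrative): the LSGAN-style losses above take lists of discriminator
# outputs, one entry per sub-discriminator.
#     dr = [torch.tensor([0.9, 1.1])]    # real outputs, pushed towards 1
#     dg = [torch.tensor([0.1, -0.1])]   # generated outputs, pushed towards 0 by D
#     d_loss, r_losses, g_losses = discriminator_loss(dr, dg)  # mean((1-dr)^2) + mean(dg^2)
#     g_loss, per_d_losses = generator_loss(dg)                # mean((1-dg)^2), pushes dg towards 1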
diff --git a/AutoCoverTool/script/pure_model.py b/AutoCoverTool/script/pure_model.py
new file mode 100644
index 0000000..39fa2a1
--- /dev/null
+++ b/AutoCoverTool/script/pure_model.py
@@ -0,0 +1,26 @@
+"""
+只保留模型原始大小
+"""
+import torch
+
+
+def keep_pure(in_path, dst_path):
+ device = 'cuda'
+ mm = torch.load(in_path, map_location=device)
+ torch.save(mm["model"], dst_path)
+
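+# Note (illustrative): checkpoints written during training contain the keys
+# {'model', 'iteration', 'optimizer', 'learning_rate'}; keep_pure() stores only the 'model'
+# state_dict, which is all that inference needs. Paths below are hypothetical:
+#     keep_pure("logs/32k/G_1000.pth", "logs/32k/G_inference_1000.pth")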
+
+def change_iter(in_path, dst_path):
+ device = 'cuda'
+ mm = torch.load(in_path, map_location=device)
+ mm["iteration"] = 1
+ mm["learning_rate"] = 0.0001
+ torch.save(mm, dst_path)
+
+
+if __name__ == '__main__':
+ # keep_pure("data/train_users/qiankun_v1/logs/32k/G_1000.pth",
+ # "data/train_users/qiankun_v1/logs/32k/G_inference_1000.pth")
+ change_iter("data/online_models/models/base_model/sunyanzi_base_d_48000.pth",
+ "data/online_models/models/base_model/sunyanzi_base_d_48000_no_flow.pth",
+ )
diff --git a/AutoCoverTool/script/train.sh b/AutoCoverTool/script/train.sh
index 5f7a31b..f1b3a8a 100644
--- a/AutoCoverTool/script/train.sh
+++ b/AutoCoverTool/script/train.sh
@@ -1,22 +1,57 @@
#export LD_LIBRARY_PATH=/data/gpu_env_common/env/anaconda3/envs/so_vits_svc/lib:$LD_LIBRARY_PATH
export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin
#export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs
export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame:$PWD/ref/adaptive_voice_conversion
mkdir -p /data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists
mkdir -p /data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/config
# 1. Collect the data and put it under train_users/zjl/src
# 2. Extract vocals & slice & keep the top-80 loudest segments
/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python script/get_vocals_for_train.py $1
# 3. Resample
/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/resample.py --in_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1 --out_dir2=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample
# 4. Generate the config files
/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/preprocess_flist_config.py --source_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample --train_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/train.txt --val_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/val.txt --test_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/test.txt --config_path=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/config/config.json
# 5. Preprocess and extract features
/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/preprocess_hubert_f0.py --in_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample
# 6. Copy data into the logs folder
mkdir -p data/train_users/multi_users/$1/logs/32k
cp -r data/models/G_0.pth data/train_users/multi_users/$1/logs/32k
cp -r data/models/D_0.pth data/train_users/multi_users/$1/logs/32k
# 7. Train
/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/train.py -c data/train_users/multi_users/$1/config/config.json -m 32k -l data/train_users/multi_users/$1/logs
+
+
+
+mkdir -p data/train_users/qiankun_v2/filelists
+mkdir -p data/train_users/qiankun_v2/config
+# 1. Collect the data and put it under train_users/zjl/src
+# 2. Extract vocals & slice & keep the top-80 loudest segments
+python script/get_vocals_for_train.py $1
+# 3. Resample
+python ref/so_vits_svc/resample.py --in_dir=data/train_users/qiankun_v2/vocals --out_dir2=data/train_users/qiankun_v2/slice_resample
+# 4. Generate the config files
+python ref/so_vits_svc/preprocess_flist_config.py --source_dir=data/train_users/qiankun_v2/slice_resample --train_list=data/train_users/qiankun_v2/filelists/train.txt --val_list=data/train_users/qiankun_v2/filelists/val.txt --test_list=data/train_users/qiankun_v2/filelists/test.txt --config_path=data/train_users/qiankun_v2/config/config.json
+# 5. Preprocess and extract features
+python ref/so_vits_svc/preprocess_hubert_f0.py --in_dir=data/train_users/qiankun_v2/slice_resample
+# 6. Copy data into the logs folder
+mkdir -p data/train_users/qiankun_v2/logs/32k
+
+# Base models
+cp -r data/models/base_D_0.pth data/train_users/xiafan_v1/logs/32k/D_0.pth
+cp -r data/models/base_G_0.pth data/train_users/xiafan_v1/logs/32k/G_0.pth
+
+# Models with the flow layer removed
+cp -r data/models/sunyanzi_v1_g_55000.pth data/train_users/qiankun_v1/logs/32k/G_0.pth
+cp -r data/models/sunyanzi_v1_d_55000.pth data/train_users/qiankun_v1/logs/32k/D_0.pth
+
+# Flow layer removed and decoder upgraded from v1 to v3
+cp -r data/models/train_base_v2_g_330000.pth data/train_users/qiankun_v2/logs/32k/G_0.pth
+cp -r data/models/train_base_v2_d_330000.pth data/train_users/qiankun_v2/logs/32k/D_0.pth
+
+# 7. Train
+python ref/so_vits_svc/train.py -c data/train_users/qiankun_v1/config/config.json -m 32k -l data/train_users/qiankun_v1/logs
+
+
+
+python ref/so_vits_svc/preprocess_flist_config.py --source_dir=data/train_users/sunyanzi_v1/slice_resample --train_list=data/train_users/sunyanzi_v1/filelists/train.txt --val_list=data/train_users/sunyanzi_v1/filelists/val.txt --test_list=data/train_users/sunyanzi_v1/filelists/test.txt --config_path=data/train_users/sunyanzi_v1/config/config.json
\ No newline at end of file
diff --git a/AutoCoverTool/script/train_user_by_one_media.py b/AutoCoverTool/script/train_user_by_one_media.py
new file mode 100644
index 0000000..e01176d
--- /dev/null
+++ b/AutoCoverTool/script/train_user_by_one_media.py
@@ -0,0 +1,531 @@
+"""
+使用一句话进行人声训练
+1. 数据集
+2. 训练
+"""
+from ref.so_vits_svc.models import SynthesizerTrn, MultiPeriodDiscriminator
+from ref.so_vits_svc.mel_processing import spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch
+import ref.so_vits_svc.utils as utils
+import ref.so_vits_svc.commons as commons
+from ref.so_vits_svc.losses import kl_loss, generator_loss, discriminator_loss, feature_loss
+
+import logging
+
+logging.getLogger('numba').setLevel(logging.WARNING)
+
+import os
+import time
+import torch
+import random
+import librosa
+import soundfile
+import torchaudio
+import parselmouth
+import numpy as np
+from tqdm import tqdm
+from scipy.io.wavfile import read
+from pyworld import pyworld
+from copy import deepcopy
+import torch.utils.data
+from torch.nn import functional as F
+from torch.utils.data import DataLoader
+from torch.cuda.amp import autocast, GradScaler
+
+gs_hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None)
+gs_model_config = {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3, 7, 11],
+ "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+ "upsample_rates": [10, 8, 2, 2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16, 16, 4, 4],
+ "n_layers_q": 3,
+ "use_spectral_norm": False,
+ "gin_channels": 256,
+ "ssl_dim": 256,
+ "n_speakers": 2
+}
+
+gs_train_config = {
+ "log_interval": 1,
+ "eval_interval": 1000,
+ "seed": 1234,
+ "epochs": 1000,
+ "learning_rate": 0.0001,
+ "betas": [
+ 0.8,
+ 0.99
+ ],
+ "eps": 1e-09,
+ "batch_size": 12,
+ "fp16_run": False,
+ "lr_decay": 0.999875,
+ "segment_size": 17920,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 1.0, # 45
+ "c_kl": 1.0,
+ "c_fm": 1.0,
+ "c_gen": 1.0,
+ "use_sr": True,
+ "max_speclen": 384
+}
+gs_data_config = {
+ "max_wav_value": 32768.0,
+ "sampling_rate": 32000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": None
+}
+
+
+def get_f0(x, p_len, f0_up_key=0):
+ time_step = 160 / 16000 * 1000
+ f0_min = 50
+ f0_max = 1100
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+ f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
+ time_step=time_step / 1000, voicing_threshold=0.6,
+ pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+ if len(f0) > p_len:
+ f0 = f0[:p_len]
+ pad_size = (p_len - len(f0) + 1) // 2
+ if (pad_size > 0 or p_len - len(f0) - pad_size > 0):
+ f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
+
+ f0 *= pow(2, f0_up_key / 12)
+ f0_mel = 1127 * np.log(1 + f0 / 700)
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+ f0_mel[f0_mel <= 1] = 1
+ f0_mel[f0_mel > 255] = 255
+ f0_coarse = np.rint(f0_mel).astype(int)  # np.int was removed in newer NumPy
+ return f0_coarse, f0
+
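+# Usage sketch (illustrative): get_f0 expects 16 kHz audio and a target frame count
+# (hop = 160 samples = 10 ms); the coarse track is quantized to 1..255 for embedding lookup.
+#     x, _ = librosa.load("some_vocal.wav", sr=16000)   # hypothetical file
+#     f0_coarse, f0 = get_f0(x, p_len=len(x) // 160)
+#     # f0_coarse: ints in [1, 255], f0: continuous F0 in Hz, both of length p_len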
+
+def resize2d(x, target_len):
+ source = np.array(x)
+ source[source < 0.001] = np.nan
+ target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
+ source)
+ res = np.nan_to_num(target)
+ return res
+
+
+def compute_f0(x, sr, c_len):
+ # x, sr = librosa.load(path, sr=32000)
+ f0, t = pyworld.dio(
+ x.astype(np.double),
+ fs=sr,
+ f0_ceil=800,
+ frame_period=1000 * 320 / sr,
+ )
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, 32000)
+ for index, pitch in enumerate(f0):
+ f0[index] = round(pitch, 1)
+ assert abs(c_len - x.shape[0] // 320) < 3, (c_len, f0.shape)
+
+ return None, resize2d(f0, c_len)
+
+
+def process(filename):
+ hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None)
+ save_name = filename + ".soft.pt"
+ if not os.path.exists(save_name):
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ wav, _ = librosa.load(filename, sr=16000)
+ wav = torch.from_numpy(wav).unsqueeze(0).to(device)
+ c = utils.get_hubert_content(hmodel, wav)
+ torch.save(c.cpu(), save_name)
+ else:
+ c = torch.load(save_name)
+ f0path = filename + ".f0.npy"
+ if not os.path.exists(f0path):
+ # compute_f0 expects the waveform and its sample rate, not the file path
+ x32, sr32 = librosa.load(filename, sr=32000)
+ cf0, f0 = compute_f0(x32, sr32, c.shape[-1] * 2)
+ np.save(f0path, f0)
+
+
+def clean_pitch(input_pitch):
+ num_nan = np.sum(input_pitch == 1)
+ if num_nan / len(input_pitch) > 0.9:
+ input_pitch[input_pitch != 1] = 1
+ return input_pitch
+
+
+class TextAudioSpeakerLoader(torch.utils.data.Dataset):
+ """
+ 1) loads audio, speaker_id, text pairs
+ 2) normalizes text and converts them to sequences of integers
+ 3) computes spectrograms from audio files.
+ """
+
+ def __init__(self, audio_path):
+ self.audio_path = audio_path
+ self.max_wav_value = gs_data_config['max_wav_value']
+ self.sampling_rate = gs_data_config['sampling_rate']
+ self.filter_length = gs_data_config['filter_length']
+ self.hop_length = gs_data_config['hop_length']
+ self.win_length = gs_data_config['win_length']
+ self.use_sr = gs_train_config['use_sr']
+ self.spec_len = gs_train_config['max_speclen']
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ self.hmodel = gs_hmodel
+
+ random.seed(1234)
+ self.audio_data = self.get_audio(audio_path)
+
+ def get_audio(self, filename):
+ # The source audio is expected to be 32 kHz mono
+
+ # Note: it is unclear whether librosa.load should be used here instead:
+ # audio, sr = librosa.load(filename, sr=self.sampling_rate, mono=True)
+ sr, audio = read(filename)
+ audio = torch.FloatTensor(audio.astype(np.float32))
+
+ audio_norm = audio / self.max_wav_value
+ audio_norm = torch.tensor(audio_norm)
+ audio_norm = audio_norm.unsqueeze(0)
+
+ # Magnitude spectrogram: window 1280 (40 ms), hop 320 (10 ms), shape (641, frame_num)
+ spec = spectrogram_torch(audio_norm, self.filter_length,
+ self.sampling_rate, self.hop_length, self.win_length,
+ center=False)
+ # print(torch.mean(spec))
+ spec = torch.squeeze(spec, 0)
+ spk = torch.LongTensor([0])
+
+ # Extract hubert features, shape (256, frame_num // 2); repeated to the full frame rate below
+ wav = librosa.resample(audio.numpy(), orig_sr=sr, target_sr=16000)
+ wav = torch.from_numpy(wav).unsqueeze(0).to(self.device)
+ c = utils.get_hubert_content(self.hmodel, wav).squeeze(0)
+
+ # Extract f0, shape (frame_num,)
+ cf0, f0 = compute_f0(audio.numpy(), sr, c.shape[-1] * 2)
+ f0 = torch.FloatTensor(f0)
+ c = torch.repeat_interleave(c, repeats=2, dim=1) # shape=(256, frame_num)
+
+ lmin = min(c.size(-1), spec.size(-1), f0.shape[0])
+ # If an assert fails, the tuple after it is shown in the error message
+ assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape, filename)
+ assert abs(lmin - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
+ assert abs(lmin - c.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
+ spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin]
+ audio_norm = audio_norm[:, :lmin * self.hop_length]
+ _spec, _c, _audio_norm, _f0 = spec, c, audio_norm, f0
+ # Tile spectrogram, hubert features, f0 and waveform until they reach spec_len
+ while spec.size(-1) < self.spec_len:
+ spec = torch.cat((spec, _spec), -1)
+ c = torch.cat((c, _c), -1)
+ f0 = torch.cat((f0, _f0), -1)
+ audio_norm = torch.cat((audio_norm, _audio_norm), -1)
+ # hubert features, f0, magnitude spectrogram, waveform, speaker id
+ return c, f0, spec, audio_norm, spk
+
+ def random_one(self):
+ c, f0, spec, audio_norm, spk = self.audio_data
+ start = random.randint(0, spec.size(-1) - self.spec_len)
+ end = start + self.spec_len
+ spec = spec[:, start:end]
+ c = c[:, start:end]
+ f0 = f0[start:end]
+ audio_norm = audio_norm[:, start * self.hop_length:end * self.hop_length]
+ return c, f0, spec, audio_norm, spk
+
+ def __getitem__(self, index):
+ return self.random_one()
+
+ def __len__(self):
+ return 1
+
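+# Usage sketch (illustrative): the loader wraps one 32 kHz mono wav and always returns the same
+# item; random_one() crops a fresh random max_speclen window on every __getitem__ call. Because
+# the dataset has length 1, each epoch is a single batch of size 1 regardless of batch_size.
+#     dataset = TextAudioSpeakerLoader("some_training_vocal_32k.wav")   # hypothetical file
+#     loader = DataLoader(dataset, num_workers=0, shuffle=False, batch_size=12)
+#     c, f0, spec, audio, spk = next(iter(loader))
+#     # c: (1, 256, 384), f0: (1, 384), spec: (1, 641, 384), audio: (1, 1, 384 * 320), spk: (1, 1)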
+
+class SoVitsSVCOnlineTrain:
+
+ def construct_model(self):
+ net_g = SynthesizerTrn(
+ gs_data_config["filter_length"] // 2 + 1,
+ gs_train_config["segment_size"] // gs_data_config["hop_length"],
+ **gs_model_config,
+ no_flow=False,
+ use_v3=False).cuda()
+ net_d = MultiPeriodDiscriminator(gs_model_config['use_spectral_norm']).cuda()
+ optim_g = torch.optim.AdamW(
+ net_g.parameters(),
+ 0.0001,
+ betas=[0.8, 0.99],
+ eps=1e-09)
+ optim_d = torch.optim.AdamW(
+ net_d.parameters(),
+ 0.0001,
+ betas=[0.8, 0.99],
+ eps=1e-09)
+
+ # checkpoint_dict = torch.load(base_g_model, map_location='cuda')
+ net_g.load_state_dict(self.g_model_dict)
+ net_d.load_state_dict(self.d_model_dict)
+ optim_g.load_state_dict(self.g_opt_dict)
+ optim_d.load_state_dict(self.d_opt_dict)
+
+ # Set the initial learning rate
+ optim_g.param_groups[0]['lr'] = 2e-4
+ optim_d.param_groups[0]['lr'] = 2e-4
+ return net_g, net_d, optim_g, optim_d
+
+ def __init__(self, base_g_model, base_d_model):
+ st1 = time.time()
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ checkpoint_dict = torch.load(base_g_model, map_location='cpu')
+ self.g_model_dict = checkpoint_dict["model"]
+ self.g_opt_dict = checkpoint_dict["optimizer"]
+
+ checkpoint_dict = torch.load(base_d_model, map_location='cpu')
+ self.d_model_dict = checkpoint_dict["model"]
+ self.d_opt_dict = checkpoint_dict["optimizer"]
+
+ print("load model_path={},{},sp={}".format(base_g_model, base_d_model, time.time() - st1))
+
+ def get_units(self, source, sr):
+ source = source.unsqueeze(0).to(self.device)
+ print("source_shape===>", source.shape)
+ with torch.inference_mode():
+ start = time.time()
+ units = gs_hmodel.units(source)
+ use_time = time.time() - start
+ print("hubert use time:{}".format(use_time))
+ return units
+
+ def get_unit_pitch(self, source, sr, tran):
+ source = torchaudio.functional.resample(source, sr, 16000)
+ if len(source.shape) == 2 and source.shape[1] >= 2:
+ source = torch.mean(source, dim=0).unsqueeze(0)
+ soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
+ f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
+ return soft, f0
+
+ def train(self, in_wav, epoch_num):
+ train_dataset = TextAudioSpeakerLoader(in_wav)
+ train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, batch_size=12)
+ net_g, net_d, optim_g, optim_d = self.construct_model()
+
+ rank = 0
+ # Speed up training (allow lower-precision float32 matmuls)
+ torch.set_float32_matmul_precision('high')
+ net_g.train()
+ net_d.train()
+ global_step = 0
+ scaler = GradScaler(enabled=gs_train_config['fp16_run'])
+
+ scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=gs_train_config['lr_decay'], last_epoch=1)
+ scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=gs_train_config['lr_decay'], last_epoch=1)
+ # Update the learning rate based on the previous run
+ # Idea: increase the LR when the loss falls, decrease it when the loss rises
+ for epoch in tqdm(range(0, epoch_num)):
+ for batch_idx, items in enumerate(train_loader):
+ # hubert features, f0, magnitude spectrogram, waveform segment (384 * hop_length samples), speaker id [0]
+ c, f0, spec, y, spk = items
+ g = spk.cuda(rank, non_blocking=True)
+ spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
+ c = c.cuda(rank, non_blocking=True)
+ f0 = f0.cuda(rank, non_blocking=True)
+ """
+ "sampling_rate": 32000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ """
+
+ # spec, n_fft, num_mels, sampling_rate, fmin, fmax
+ mel = spec_to_mel_torch(spec, gs_data_config['filter_length'], gs_data_config['n_mel_channels'],
+ gs_data_config['sampling_rate'], gs_data_config['mel_fmin'],
+ gs_data_config['mel_fmax'])
+ with autocast(enabled=gs_train_config['fp16_run']):
+ # net_g inputs: hubert features, f0, magnitude spectrogram, speaker id, mel spectrogram
+ # net_g outputs:
+ # generated waveform, slice positions of the sampled frames, valid-frame mask of the spectrogram,
+ # z sampled from the posterior (encoded spectrogram), z_p = flow(z),
+ # prior mean (m_p) and prior log-std (logs_p) from the hubert branch,
+ # posterior mean (m_q) and log-std (logs_q) from the spectrogram + speaker branch
+ y_hat, ids_slice, z_mask, \
+ (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel)
+
+ y_mel = commons.slice_segments(mel, ids_slice,
+ gs_train_config['segment_size'] // gs_data_config['hop_length'])
+
+ y_hat_mel = mel_spectrogram_torch(
+ y_hat.squeeze(1),
+ gs_data_config['filter_length'],
+ gs_data_config['n_mel_channels'],
+ gs_data_config['sampling_rate'],
+ gs_data_config['hop_length'],
+ gs_data_config['win_length'],
+ gs_data_config['mel_fmin'],
+ gs_data_config['mel_fmax']
+ )
+ y = commons.slice_segments(y, ids_slice * gs_data_config['hop_length'],
+ gs_train_config['segment_size']) # slice
+
+ # Discriminator
+ y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
+
+ with autocast(enabled=False):
+ loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
+ loss_disc_all = loss_disc
+
+ optim_d.zero_grad()
+ scaler.scale(loss_disc_all).backward()
+ scaler.unscale_(optim_d)
+ scaler.step(optim_d)
+ with autocast(enabled=gs_train_config['fp16_run']):
+ # Generator
+ y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
+ with autocast(enabled=False):
+ # L1 loss between mel spectrograms, scaled by c_mel; smaller is better
+ loss_mel = F.l1_loss(y_mel, y_hat_mel) * gs_train_config['c_mel']
+ # KL divergence; z_p: posterior sample pushed through the flow, logs_q: posterior log-std,
+ # m_p: prior mean from the hubert branch, logs_p: prior log-std, z_mask: valid-frame mask
+ loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * gs_train_config['c_kl']
+ # L1 distance between every discriminator feature map of y and y_hat
+ loss_fm = feature_loss(fmap_r, fmap_g) * gs_train_config['c_fm']
+ loss_gen, losses_gen = generator_loss(y_d_hat_g)
+ loss_gen_all = loss_gen * gs_train_config['c_gen'] + loss_fm + loss_mel + loss_kl
+ optim_g.zero_grad()
+ scaler.scale(loss_gen_all).backward()
+ scaler.unscale_(optim_g)
+ scaler.step(optim_g)
+ scaler.update()
+
+ if global_step % gs_train_config['log_interval'] == 0:
+ lr = optim_g.param_groups[0]['lr']
+ losses_numpy = [round(loss_disc.item(), 3), round(loss_gen.item(), 3),
+ round(loss_fm.item(), 3), round(loss_mel.item(), 3), round(loss_kl.item(), 3)]
+ print("gstep={},lr={},disc={},gen={},fm={},mel={},kl={},tot={}".format(global_step, lr,
+ losses_numpy[0],
+ losses_numpy[1],
+ losses_numpy[2],
+ losses_numpy[3],
+ losses_numpy[4],
+ sum(losses_numpy)))
+
+ if global_step % 200 == 0:
+ torch.save(net_g.state_dict(), "data/web_trained_models/xiafan_{}.pth".format(global_step))
+
+ global_step += 1
+
+ scheduler_g.step()
+ scheduler_d.step()
+ return net_g
+
+ def infer(self, in_wav, dst_wav, model):
+ tran = 0  # pitch shift (semitones)
+ source, sr = librosa.load(in_wav, sr=32000, mono=True)
+ source = torch.tensor(source).unsqueeze(0)
+ sid = torch.LongTensor([0]).to(self.device).unsqueeze(0)
+ soft, pitch = self.get_unit_pitch(source, sr, tran)
+ f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
+ stn_tst = torch.FloatTensor(soft)
+
+ with torch.no_grad():
+ model.eval()
+ x_tst = stn_tst.unsqueeze(0).to(self.device)
+ start = time.time()
+ x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
+ audio = model.infer(x_tst, f0=f0, g=sid)[0, 0].data.float()
+ use_time = time.time() - start
+ print("vits use time:{}".format(use_time))
+ # Write the output file
+ soundfile.write(dst_wav, audio.cpu().numpy(), sr, format='wav')
+
+ ####### Public API: train, then run inference
+ def process_train_and_infer(self, train_media, in_path, dst_path, dst_model_path=None, params={}):
+ """
+ :param train_media: audio used for training
+ :param in_path: vocal audio to be converted
+ :param dst_path: path of the converted output file
+ :param dst_model_path: optional path for caching the trained model
+ :return: 0 on success, 1/2 on transcoding failure
+ """
+ # Transcode train_media to 32 kHz mono
+ tmp_wav = train_media + "_321.wav"
+ cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(train_media, tmp_wav)
+ os.system(cmd)
+ if not os.path.exists(tmp_wav):
+ return 1
+ in_wav_tmp = in_path + "_321.wav"
+ cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(in_path, in_wav_tmp)
+ os.system(cmd)
+ if not os.path.exists(in_wav_tmp):
+ os.unlink(tmp_wav)
+ return 2
+
+ global gs_train_config
+ max_step = params.get('max_step', 200)
+ gs_train_config['c_mel'] = params.get("c_mel", 45)
+ gs_train_config['c_fm'] = params.get("c_fm", 1.0)
+ gs_train_config['c_gen'] = params.get("c_gen", 1.0)
+
+ print("params:{}".format(params))
+ st = time.time()
+ model = self.train(tmp_wav, max_step)
+ print("train sp={}".format(time.time() - st))
+
+ st = time.time()
+ self.infer(in_wav_tmp, dst_path, model)
+ print("infer sp={}".format(time.time() - st))
+
+ if dst_model_path is not None:
+ st = time.time()
+ torch.save(model.state_dict(), dst_model_path)
+ print("save model sp={}".format(time.time() - st))
+
+ os.unlink(tmp_wav)
+ os.unlink(in_wav_tmp)
+ return 0
+
+ # Inference with a previously trained model
+ def process_infer(self, model_path, in_path, dst_path):
+ net_g = SynthesizerTrn(
+ gs_data_config["filter_length"] // 2 + 1,
+ gs_train_config["segment_size"] // gs_data_config["hop_length"],
+ **gs_model_config,
+ no_flow=False,
+ use_v3=False).cuda()
+ model_dict = torch.load(model_path, map_location='cpu')
+ net_g.load_state_dict(model_dict)
+ in_wav_tmp = in_path + "_321.wav"
+ cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(in_path, in_wav_tmp)
+ os.system(cmd)
+ if not os.path.exists(in_wav_tmp):
+ return 2
+
+ self.infer(in_wav_tmp, dst_path, net_g)
+
+ def get_f0(self, vocal_path):
+ # Load at 16 kHz and delegate to the module-level get_f0 helper (hop = 160 samples)
+ x, _ = librosa.load(vocal_path, sr=16000)
+ return get_f0(x, len(x) // 160)
+
+
+if __name__ == '__main__':
+ pp = "data/train_users/qiankun_v1/vocals/speaker0/qiankun.wav"
+ in_p = "data/test/vocal_32.wav"
+ dst_p = "data/test/vocal_32_out.wav"
+ dst_m_p = "data/test/mm.pth"
+
+ g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth"
+ d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth"
+ svsot = SoVitsSVCOnlineTrain(g_path, d_path)
+
+ start_time = time.time()
+ ret = svsot.process_train_and_infer(pp, in_p, dst_p, dst_m_p)
+ print("process = {} ret={}".format(time.time() - start_time, ret))
diff --git a/AutoCoverTool/svc_inference/config.json b/AutoCoverTool/svc_inference/config.json
index 8399ea3..ec1f0a5 100644
--- a/AutoCoverTool/svc_inference/config.json
+++ b/AutoCoverTool/svc_inference/config.json
@@ -1,90 +1,91 @@
{
"train": {
"log_interval": 200,
"eval_interval": 1000,
"seed": 1234,
"epochs": 1000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 12,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 17920,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0,
"use_sr": true,
"max_speclen": 384,
"port": "8002"
},
"data": {
"training_files": "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/dlj_v1/filelists/train.txt",
"validation_files": "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/dlj_v1/filelists/val.txt",
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 1280,
"hop_length": 320,
"win_length": 1280,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
8,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"n_layers_q": 3,
"use_spectral_norm": false,
"gin_channels": 256,
"ssl_dim": 256,
"n_speakers": 2
},
"spk": {
- "speaker0": 0
+ "speaker0": 0,
+ "speaker1": 1
}
}
\ No newline at end of file
diff --git a/AutoCoverTool/svc_inference/webui.py b/AutoCoverTool/svc_inference/webui.py
index 48a7031..0652207 100644
--- a/AutoCoverTool/svc_inference/webui.py
+++ b/AutoCoverTool/svc_inference/webui.py
@@ -1,76 +1,77 @@
"""
构建唱歌音色转换网页(基于3.0)
要求:
1. 音频上传
2. 推理
3. 下载
"""
import os
import time
import glob
import shutil
import librosa
import soundfile
import gradio as gr
from online.inference_one import inf
gs_tmp_dir = "/tmp/svc_inference_one_web"
-gs_model_dir = "/data/prod/so_vits_models/3.0"
+# gs_model_dir = "/data/prod/so_vits_models/3.0"
+gs_model_dir = "/data/rsync/jianli.yang/AutoCoverToolNew/AutoCoverTool/data/online_models/models"
gs_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json")
gs_models_choices = glob.glob(os.path.join(gs_model_dir, "*/*pth"))
gs_model_list_dropdown = None
def svc(audio_data, model_path):
sr, data = audio_data
if os.path.exists(gs_tmp_dir):
shutil.rmtree(gs_tmp_dir)
os.makedirs(gs_tmp_dir)
tmp_path = os.path.join(gs_tmp_dir, "tmp.wav")
soundfile.write(tmp_path, data, sr, format="wav")
# Resample to 32 kHz
audio, sr = librosa.load(tmp_path, sr=32000, mono=True)
tmp_path = os.path.join(gs_tmp_dir, "tmp_32.wav")
out_path = os.path.join(gs_tmp_dir, "out.wav")
soundfile.write(tmp_path, audio, sr, format="wav")  # write the resampled signal, not the original data
# Inference
print("svc: {}".format(model_path))
st = time.time()
inf(model_path, gs_config_path, tmp_path, out_path, 'cuda')
print("input d={}, sp = {}".format(len(audio) / sr, time.time() - st))
return out_path
def model_select():
files = glob.glob(os.path.join(gs_model_dir, "*/*pth"))
return gs_model_list_dropdown.update(choices=files)
def main():
# header
app = gr.Blocks()
with app:
# Page header
gr.Markdown(value="""
### Singing voice conversion
Author: starmaker audio/video team
""")
global gs_model_list_dropdown
gs_model_list_dropdown = gr.Dropdown(choices=gs_models_choices, interactive=True, label="model list")
refresh_btn = gr.Button("refresh_model_list")
refresh_btn.click(fn=model_select, inputs=[], outputs=gs_model_list_dropdown)
# Source audio input box
input_audio = gr.inputs.Audio(label="input")
gen_btn = gr.Button("generate", variant="primary")
output_audio = gr.outputs.Audio(label="output", type='filepath')
gen_btn.click(fn=svc, inputs=[input_audio, gs_model_list_dropdown], outputs=output_audio)
# The queue ensures only one job runs on the server at a time
app.queue(concurrency_count=1, max_size=2044).launch(server_name="0.0.0.0", inbrowser=True, quiet=True,
server_port=7860)
if __name__ == '__main__':
main()
diff --git a/AutoCoverTool/svc_inference/webui_play.py b/AutoCoverTool/svc_inference/webui_play.py
new file mode 100644
index 0000000..abe787f
--- /dev/null
+++ b/AutoCoverTool/svc_inference/webui_play.py
@@ -0,0 +1,133 @@
+"""
+构建唱歌音色转换网页(基于3.0)
+1. 要求上传一个音频
+2. 选定男女声
+3. 选择一首歌曲
+4. 训练特定轮次并合成歌曲
+"""
+
+import os
+import time
+import glob
+import hashlib
+import shutil
+import librosa
+import soundfile
+import gradio as gr
+from script.train_user_by_one_media import SoVitsSVCOnlineTrain
+
+gs_g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth"
+gs_d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth"
+gs_out_model_dir = "data/web_trained_models_for_play"
+gs_out_audio_dir = "data/web_trained_models_for_play_audio"
+gs_work_dir = "/tmp/train_user_by_one_media_for_play"
+gs_ssot_inst = SoVitsSVCOnlineTrain(gs_g_path, gs_d_path)
+gs_draw_volume_exe = "/data/gpu_env_common/bin/draw_volume"
+gs_simple_mixer_path = "/data/gpu_env_common/bin/simple_mixer"
+
+
+def get_song_map():
+ female_song_names = []
+ song_list = glob.glob("data/resource/female/*")
+ for song in song_list:
+ female_song_names.append(song.replace("data/resource/female/", ""))
+ male_song_names = []
+ song_list = glob.glob("data/resource/male/*")
+ for song in song_list:
+ male_song_names.append(song.replace("data/resource/male/", ""))
+ song_map = {
+ "female": female_song_names,
+ "male": male_song_names,
+ }
+ return song_map
+
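+# Illustrative note: get_song_map() assumes each song lives under data/resource/<gender>/<song>/
+# containing vocal321.wav (the 32 kHz vocal used for inference) and acc.wav (the accompaniment),
+# as referenced by train_svc() below.
+#     gs_song_map -> {"female": ["songA", ...], "male": ["songB", ...]}   # hypothetical song names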
+
+gs_song_map = get_song_map()
+gs_song_list_dropdown = None
+
+
+def song_select(gender):
+ return gs_song_list_dropdown.update(choices=gs_song_map[gender]), gs_song_map[gender][0]
+
+
+def get_file_md5(filename):
+ with open(filename, "rb") as fp:
+ return hashlib.md5(fp.read()).hexdigest()
+
+
+def mix(in_path, acc_path, dst_path):
+ # Transcode the SVC output to 44.1 kHz stereo
+ svc_442_file = in_path + "_442.wav"
+ st = time.time()
+ cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(in_path, svc_442_file)
+ os.system(cmd)
+ if not os.path.exists(svc_442_file):
+ return -1
+ print("transcode,{},sp={}".format(in_path, time.time() - st))
+
+ # Mix with the accompaniment
+ st = time.time()
+ cmd = "{} {} {} {} 1".format(gs_simple_mixer_path, svc_442_file, acc_path, dst_path)
+ os.system(cmd)
+ print("mixer,{},sp={}".format(in_path, time.time() - st))
+
+
+def train_svc(train_audio_data, gender, song_name):
+ if os.path.exists(gs_work_dir):
+ shutil.rmtree(gs_work_dir)
+ os.makedirs(gs_work_dir)
+
+ train_audio_path = os.path.join(gs_work_dir, "inf.wav")
+
+ sr, data = train_audio_data
+ soundfile.write(train_audio_path, data, samplerate=sr, format='wav')
+
+ inf_audio_path = os.path.join("data/resource/{}/{}/vocal321.wav".format(gender, song_name))  # vocals
+ inf_acc_path = os.path.join("data/resource/{}/{}/acc.wav".format(gender, song_name))  # accompaniment
+ inf_out_path = os.path.join(gs_work_dir, "tmp.wav")
+ print("svc: {}".format(train_audio_path))
+
+ st = time.time()
+ md5 = get_file_md5(train_audio_path)
+ out_model_path = os.path.join(gs_out_model_dir, "{}.pth".format(md5))
+
+ print("inputMsg:", train_audio_path, inf_audio_path, out_model_path)
+ if os.path.exists(out_model_path):
+ err = gs_ssot_inst.process_infer(out_model_path, inf_audio_path, inf_out_path)
+ else:
+ err = gs_ssot_inst.process_train_and_infer(train_audio_path, inf_audio_path, inf_out_path, out_model_path)
+
+ # Merge vocals and accompaniment
+ out_path = os.path.join(gs_out_audio_dir, "{}.wav".format(md5))
+ mix(inf_out_path, inf_acc_path, out_path)
+ print("input err={}, sp = {}".format(err, time.time() - st))
+ return out_path
+
+
+def main():
+ # header
+ app = gr.Blocks()
+ with app:
+ # Page header
+ gr.Markdown(value="""
+ ### Sing with your own voice
+ #### How to use: record about 15 s of audio on your phone, drag it onto the page, click start, then wait about 2 minutes
+ Author: starmaker audio/video team
+ """)
+ # train_audio_path
+ train_audio = gr.inputs.Audio(label="input_audio")
+ gender = gr.inputs.Radio(choices=["female", "male"], default="female")
+ global gs_song_list_dropdown
+ gs_song_list_dropdown = gr.Dropdown(choices=gs_song_map["female"], interactive=True, label="song list")
+ gender.change(song_select, inputs=[gender], outputs=[gs_song_list_dropdown, gs_song_list_dropdown])
+ gen_btn = gr.Button("generate", variant="primary")
+
+ output_audio = gr.outputs.Audio(label="output", type='filepath')
+ gen_btn.click(fn=train_svc, inputs=[train_audio, gender, gs_song_list_dropdown], outputs=output_audio)
+ # The queue ensures only one job runs on the server at a time
+ app.queue(concurrency_count=1, max_size=2044).launch(server_name="0.0.0.0", inbrowser=True, quiet=True,
+ server_port=7860)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/AutoCoverTool/svc_inference/webui_v1.py b/AutoCoverTool/svc_inference/webui_v1.py
new file mode 100644
index 0000000..b05c8af
--- /dev/null
+++ b/AutoCoverTool/svc_inference/webui_v1.py
@@ -0,0 +1,107 @@
+"""
+构建唱歌音色转换网页(基于3.0)
+要求:
+1. 音频上传
+2. 训练
+3. 推理
+4. 下载
+第一版功能: 给定两个文件(1) 待训练音频 (2) 待推理音频 输出 音色转换后的音频
+"""
+import os
+import time
+import glob
+import shutil
+import librosa
+import soundfile
+import gradio as gr
+from script.train_user_by_one_media import SoVitsSVCOnlineTrain
+
+gs_g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth"
+gs_d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth"
+gs_out_model_dir = "data/web_trained_models"
+gs_work_dir = "/tmp/train_user_by_one_media"
+gs_ssot_inst = SoVitsSVCOnlineTrain(gs_g_path, gs_d_path)
+gs_models_choices = glob.glob(os.path.join(gs_out_model_dir, "*pth"))
+
+
+def train_svc(train_audio_path, inf_audio_path, name, train_step, c_mel, c_fm, c_gen):
+ train_step = int(train_step)
+ if os.path.exists(gs_work_dir):
+ shutil.rmtree(gs_work_dir)
+ print("svc: {}".format(train_audio_path, inf_audio_path))
+ st = time.time()
+ os.makedirs(gs_work_dir)
+ out_path = os.path.join(gs_work_dir, "tmp.wav")
+ out_model_path = None
+ if name != "":
+ out_model_path = os.path.join(gs_out_model_dir, "{}_{}.pth".format(name, train_step))
+ print("inputMsg:", train_audio_path, inf_audio_path, out_path, out_model_path, train_step)
+ err = gs_ssot_inst.process_train_and_infer(train_audio_path, inf_audio_path, out_path, out_model_path,
+ params={'max_step': 100, 'c_mel': int(c_mel), 'c_fm': int(c_fm),
+ 'c_gen': int(c_gen)})
+ print("input err={}, step={}, sp = {}".format(err, train_step, time.time() - st))
+ return out_path
+
+
+def svc_v1(inf_audio_path, model_path):
+ print("svc={}", model_path)
+ if os.path.exists(gs_work_dir):
+ shutil.rmtree(gs_work_dir)
+ os.makedirs(gs_work_dir)
+
+ # Resample to 32 kHz
+ audio, sr = librosa.load(inf_audio_path, sr=32000, mono=True)
+ tmp_path = os.path.join(gs_work_dir, "tmp_32.wav")
+ out_path = os.path.join(gs_work_dir, "out.wav")
+ soundfile.write(tmp_path, audio, sr, format="wav")
+
+ # Inference
+ print("svc: {}".format(model_path))
+ st = time.time()
+ gs_ssot_inst.process_infer(model_path, tmp_path, out_path)
+ print("input d={}, sp = {}".format(len(audio) / sr, time.time() - st))
+ return out_path
+
+
+def model_select():
+ files = glob.glob(os.path.join(gs_out_model_dir, "*pth"))
+ return gs_model_list_dropdown.update(choices=files)
+
+
+def main():
+ # header
+ app = gr.Blocks()
+ with app:
+ # Page header
+ gr.Markdown(value="""
+ ### Singing voice conversion
+ Author: starmaker audio/video team
+ """)
+ global gs_model_list_dropdown
+ gs_model_list_dropdown = gr.Dropdown(choices=gs_models_choices, interactive=True, label="model list")
+ refresh_btn = gr.Button("refresh_model_list")
+ refresh_btn.click(fn=model_select, inputs=[], outputs=gs_model_list_dropdown)
+
+ # train_audio_path
+ train_audio = gr.inputs.Audio(label="train_audio", type='filepath')
+ svc_audio = gr.inputs.Audio(label="svc_audio", type='filepath')
+ name_text = gr.inputs.Textbox(label="model_name", default="")
+ with gr.Row():
+ max_step = gr.inputs.Number(label="max_step", default=100)
+ c_mel = gr.inputs.Number(label="c_mel", default=45)
+ c_fm = gr.inputs.Number(label="c_fm", default=1)
+ c_gen = gr.inputs.Number(label="c_gen", default=1)
+ gen_btn = gr.Button("generate", variant="primary")
+ gen_btn1 = gr.Button("generateByModel", variant="primary")
+
+ output_audio = gr.outputs.Audio(label="output", type='filepath')
+ gen_btn.click(fn=train_svc, inputs=[train_audio, svc_audio, name_text, max_step, c_mel, c_fm, c_gen],
+ outputs=output_audio)
+ gen_btn1.click(fn=svc_v1, inputs=[svc_audio, gs_model_list_dropdown], outputs=output_audio)
+ # The queue ensures only one job runs on the server at a time
+ app.queue(concurrency_count=1, max_size=2044).launch(server_name="0.0.0.0", inbrowser=True, quiet=True,
+ server_port=7860)
+
+
+if __name__ == '__main__':
+ main()
