diff --git a/AutoCoverTool/script/train_user_by_one_media.py b/AutoCoverTool/script/train_user_by_one_media.py
index e733f67..561cd63 100644
--- a/AutoCoverTool/script/train_user_by_one_media.py
+++ b/AutoCoverTool/script/train_user_by_one_media.py
@@ -1,547 +1,544 @@
"""
Train a voice model from a single utterance.
1. Build the dataset
2. Train
"""
from ref.so_vits_svc.models import SynthesizerTrn, MultiPeriodDiscriminator
from ref.so_vits_svc.mel_processing import spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch
import ref.so_vits_svc.utils as utils
import ref.so_vits_svc.commons as commons
from ref.so_vits_svc.losses import kl_loss, generator_loss, discriminator_loss, feature_loss

import logging

logging.getLogger('numba').setLevel(logging.WARNING)

import os
import time
import torch
import random
import librosa
import soundfile
import torchaudio
import parselmouth
import numpy as np
from tqdm import tqdm
from scipy.io.wavfile import read
from pyworld import pyworld
from copy import deepcopy
import torch.utils.data
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

gs_denoise_exe = "/data/gpu_env_common/bin/denoise_exe"
gs_hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None)
gs_model_config = {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [10, 8, 2, 2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16, 16, 4, 4],
    "n_layers_q": 3,
    "use_spectral_norm": False,
    "gin_channels": 256,
    "ssl_dim": 256,
    "n_speakers": 2
}
gs_train_config = {
    "log_interval": 1,
    "eval_interval": 1000,
    "seed": 1234,
    "epochs": 1000,
    "learning_rate": 0.0001,
    "betas": [0.8, 0.99],
    "eps": 1e-09,
    "batch_size": 12,
    "fp16_run": False,
    "lr_decay": 0.999875,
    "segment_size": 17920,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 1.0,  # 45
    "c_kl": 1.0,
    "c_fm": 1.0,
    "c_gen": 1.0,
    "use_sr": True,
    "max_speclen": 384
}
gs_data_config = {
    "max_wav_value": 32768.0,
    "sampling_rate": 32000,
    "filter_length": 1280,
    "hop_length": 320,
    "win_length": 1280,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": None
}


def get_f0(x, p_len, f0_up_key=0):
    time_step = 160 / 16000 * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    if len(f0) > p_len:
        f0 = f0[:p_len]
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0


def resize2d(x, target_len):
    source = np.array(x)
    source[source < 0.001] = np.nan
    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
                       np.arange(0, len(source)), source)
    res = np.nan_to_num(target)
    return res


def compute_f0(x, sr, c_len):
    # x, sr = librosa.load(path, sr=32000)
    f0, t = pyworld.dio(
        x.astype(np.double),
        fs=sr,
        f0_ceil=800,
        frame_period=1000 * 320 / sr,
    )
    f0 = pyworld.stonemask(x.astype(np.double), f0, t, 32000)
    for index, pitch in enumerate(f0):
        f0[index] = round(pitch, 1)
    assert abs(c_len - x.shape[0] // 320) < 3, (c_len, f0.shape)
    return None, resize2d(f0, c_len)


def process(filename):
    hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None)
    save_name = filename + ".soft.pt"
    if not os.path.exists(save_name):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        wav, _ = librosa.load(filename, sr=16000)
        wav = torch.from_numpy(wav).unsqueeze(0).to(device)
        c = utils.get_hubert_content(hmodel, wav)
        torch.save(c.cpu(), save_name)
    else:
        c = torch.load(save_name)

    f0path = filename + ".f0.npy"
    if not os.path.exists(f0path):
        wav32, sr32 = librosa.load(filename, sr=32000)
        cf0, f0 = compute_f0(wav32, sr32, c.shape[-1] * 2)
        np.save(f0path, f0)


def clean_pitch(input_pitch):
    num_nan = np.sum(input_pitch == 1)
    if num_nan / len(input_pitch) > 0.9:
        input_pitch[input_pitch != 1] = 1
    return input_pitch


class TextAudioSpeakerLoader(torch.utils.data.Dataset):
    """
    1) loads audio, speaker_id, text pairs
    2) normalizes text and converts them to sequences of integers
    3) computes spectrograms from audio files.
    """

    def __init__(self, audio_path):
        self.audio_path = audio_path
        self.max_wav_value = gs_data_config['max_wav_value']
        self.sampling_rate = gs_data_config['sampling_rate']
        self.filter_length = gs_data_config['filter_length']
        self.hop_length = gs_data_config['hop_length']
        self.win_length = gs_data_config['win_length']
        self.use_sr = gs_train_config['use_sr']
        self.spec_len = gs_train_config['max_speclen']
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.hmodel = gs_hmodel
        random.seed(1234)
        self.audio_data = self.get_audio(audio_path)

    def get_audio(self, filename):
        # The source audio is 32 kHz mono.
        # There is some uncertainty here:
        # audio, sr = librosa.load(filename, sr=self.sampling_rate, mono=True)
        sr, audio = read(filename)
        audio = torch.FloatTensor(audio.astype(np.float32))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)

        # Magnitude spectrogram: window 1280 (40 ms), hop 320 (10 ms), shape (641, frame_num)
        spec = spectrogram_torch(audio_norm, self.filter_length,
                                 self.sampling_rate, self.hop_length, self.win_length,
                                 center=False)
        # print(torch.mean(spec))
        spec = torch.squeeze(spec, 0)
        spk = torch.LongTensor([0])

        # Extract hubert features, shape (256, frame_num // 2); padded to full length below
-        wav = librosa.resample(audio.numpy(), sr, 16000)
+        wav = librosa.resample(y=audio.numpy(), orig_sr=sr, target_sr=16000)
        wav = torch.from_numpy(wav).unsqueeze(0).to(self.device)
        c = utils.get_hubert_content(self.hmodel, wav).squeeze(0)

        # Extract f0 features, shape (frame_num,)
        cf0, f0 = compute_f0(audio.numpy(), sr, c.shape[-1] * 2)
        f0 = torch.FloatTensor(f0)
        c = torch.repeat_interleave(c, repeats=2, dim=1)  # shape (256, frame_num)

        lmin = min(c.size(-1), spec.size(-1), f0.shape[0])
        # If the condition fails, the assert raises and reports the tuple that follows
        assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape, filename)
        assert abs(lmin - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
        assert abs(lmin - c.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape)
        spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin]
        audio_norm = audio_norm[:, :lmin * self.hop_length]
        _spec, _c, _audio_norm, _f0 = spec, c, audio_norm, f0

        # Tile spectrogram, hubert features, f0 and waveform until they cover at least spec_len frames
        while spec.size(-1) < self.spec_len:
            spec = torch.cat((spec, _spec), -1)
            c = torch.cat((c, _c), -1)
            f0 = torch.cat((f0, _f0), -1)
            audio_norm = torch.cat((audio_norm, _audio_norm), -1)

        # hubert features, f0, magnitude spectrogram, matching waveform, speaker id
        return c, f0, spec, audio_norm, spk
    def random_one(self):
        c, f0, spec, audio_norm, spk = self.audio_data
        start = random.randint(0, spec.size(-1) - self.spec_len)
        end = start + self.spec_len
        spec = spec[:, start:end]
        c = c[:, start:end]
        f0 = f0[start:end]
        audio_norm = audio_norm[:, start * self.hop_length:end * self.hop_length]
        return c, f0, spec, audio_norm, spk

    def __getitem__(self, index):
        c, f0, spec, audio_norm, spk = self.random_one()
        # Skip segments with no voice in them (retry up to 3 times)
        cnt = 0
        while torch.mean(torch.abs(audio_norm)) < 0.02 and cnt < 3:
            c, f0, spec, audio_norm, spk = self.random_one()
            cnt += 1
        return c, f0, spec, audio_norm, spk

    def __len__(self):
        return 1


class SoVitsSVCOnlineTrain:
    def construct_model(self):
        net_g = SynthesizerTrn(
            gs_data_config["filter_length"] // 2 + 1,
            gs_train_config["segment_size"] // gs_data_config["hop_length"],
            **gs_model_config, no_flow=False, use_v3=False).cuda()
        net_d = MultiPeriodDiscriminator(gs_model_config['use_spectral_norm']).cuda()
        optim_g = torch.optim.AdamW(
            net_g.parameters(), 0.0001, betas=[0.8, 0.99], eps=1e-09)
        optim_d = torch.optim.AdamW(
            net_d.parameters(), 0.0001, betas=[0.8, 0.99], eps=1e-09)
        # checkpoint_dict = torch.load(base_g_model, map_location='cuda')
        net_g.load_state_dict(self.g_model_dict)
        net_d.load_state_dict(self.d_model_dict)
        optim_g.load_state_dict(self.g_opt_dict)
        optim_d.load_state_dict(self.d_opt_dict)

        # Set the initial learning rate
        optim_g.param_groups[0]['lr'] = 2e-4
        optim_d.param_groups[0]['lr'] = 2e-4
        return net_g, net_d, optim_g, optim_d

    def __init__(self, base_g_model, base_d_model):
        st1 = time.time()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        checkpoint_dict = torch.load(base_g_model, map_location='cpu')
        self.g_model_dict = checkpoint_dict["model"]
        self.g_opt_dict = checkpoint_dict["optimizer"]
        checkpoint_dict = torch.load(base_d_model, map_location='cpu')
        self.d_model_dict = checkpoint_dict["model"]
        self.d_opt_dict = checkpoint_dict["optimizer"]
        print("load model_path={},{},sp={}".format(base_g_model, base_d_model, time.time() - st1))

    def get_units(self, source, sr):
        source = source.unsqueeze(0).to(self.device)
        print("source_shape===>", source.shape)
        with torch.inference_mode():
            start = time.time()
            units = gs_hmodel.units(source)
            use_time = time.time() - start
            print("hubert use time:{}".format(use_time))
        return units

    def get_unit_pitch(self, source, sr, tran):
        source = torchaudio.functional.resample(source, sr, 16000)
        if len(source.shape) == 2 and source.shape[1] >= 2:
            source = torch.mean(source, dim=0).unsqueeze(0)
        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
        f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
        return soft, f0

    def train(self, in_wav, epoch_num):
        train_dataset = TextAudioSpeakerLoader(in_wav)
        train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, batch_size=12)
        net_g, net_d, optim_g, optim_d = self.construct_model()
        rank = 0

        # For faster training
        torch.set_float32_matmul_precision('high')
        net_g.train()
        net_d.train()
        global_step = 0
        scaler = GradScaler(enabled=gs_train_config['fp16_run'])

        scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=gs_train_config['lr_decay'],
                                                             last_epoch=1)
        scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=gs_train_config['lr_decay'],
                                                             last_epoch=1)
        # Update the learning rate based on the previous result.
        # Idea: raise the learning rate when the loss decreases, lower it when the loss increases.
        for epoch in tqdm(range(0, epoch_num)):
            for batch_idx, items in enumerate(train_loader):
                # hubert features, f0, magnitude spectrogram, matching waveform (384 * hop_length), speaker id [0]
                c, f0, spec, y, spk = items
                g = spk.cuda(rank, non_blocking=True)
                spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
                c = c.cuda(rank, non_blocking=True)
                f0 = f0.cuda(rank, non_blocking=True)
                """
                "sampling_rate": 32000,
                "filter_length": 1280,
                "hop_length": 320,
                "win_length": 1280,
                "n_mel_channels": 80,
                "mel_fmin": 0.0,
                "mel_fmax": null
                """
                # spec, n_fft, num_mels, sampling_rate, fmin, fmax
                mel = spec_to_mel_torch(spec, gs_data_config['filter_length'], gs_data_config['n_mel_channels'],
                                        gs_data_config['sampling_rate'], gs_data_config['mel_fmin'],
                                        gs_data_config['mel_fmax'])
                with autocast(enabled=gs_train_config['fp16_run']):
                    # net_g input: hubert features, f0, magnitude spectrogram, speaker id, mel spectrogram
                    # net_g output:
                    #   raw waveform, position of each sampled frame in the batch, valid spectrogram frame positions,
                    #   z sampled from the normal distribution encoded from the spectrogram,
                    #   z_p = z after the normalizing flow, mean of the hubert-side distribution (m_p),
                    #   std of the hubert-side distribution (logs_p),
                    #   mean from spectrogram + speaker info (m_q), std from spectrogram + speaker info (logs_q)
                    y_hat, ids_slice, z_mask, \
                        (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel)
                    y_mel = commons.slice_segments(mel, ids_slice,
                                                   gs_train_config['segment_size'] // gs_data_config['hop_length'])
                    y_hat_mel = mel_spectrogram_torch(
                        y_hat.squeeze(1),
                        gs_data_config['filter_length'],
                        gs_data_config['n_mel_channels'],
                        gs_data_config['sampling_rate'],
                        gs_data_config['hop_length'],
                        gs_data_config['win_length'],
                        gs_data_config['mel_fmin'],
                        gs_data_config['mel_fmax']
                    )
                    y = commons.slice_segments(y, ids_slice * gs_data_config['hop_length'],
                                               gs_train_config['segment_size'])  # slice

                    # Discriminator
                    y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
                    with autocast(enabled=False):
                        loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
                        loss_disc_all = loss_disc

                optim_d.zero_grad()
                scaler.scale(loss_disc_all).backward()
                scaler.unscale_(optim_d)
                scaler.step(optim_d)

                with autocast(enabled=gs_train_config['fp16_run']):
                    # Generator
                    y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
                    with autocast(enabled=False):
                        # L1 loss between mel spectrograms, scaled by c_mel; smaller is better
                        loss_mel = F.l1_loss(y_mel, y_hat_mel) * gs_train_config['c_mel']
                        # KL divergence. z_p: spectrogram-side sample after the flow, logs_q: spectrogram-side std,
                        # m_p: hubert-side mean, logs_p: hubert-side std, z_mask: valid spectrogram frame positions
                        loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * gs_train_config['c_kl']
                        # Feature-matching loss: L1 distance between every discriminator layer's features for y and y_hat
                        loss_fm = feature_loss(fmap_r, fmap_g) * gs_train_config['c_fm']
                        loss_gen, losses_gen = generator_loss(y_d_hat_g)
                        loss_gen_all = loss_gen * gs_train_config['c_gen'] + loss_fm + loss_mel + loss_kl

                optim_g.zero_grad()
                scaler.scale(loss_gen_all).backward()
                scaler.unscale_(optim_g)
                scaler.step(optim_g)
                scaler.update()

                if global_step % gs_train_config['log_interval'] == 0:
                    lr = optim_g.param_groups[0]['lr']
                    losses_numpy = [round(loss_disc.item(), 3), round(loss_gen.item(), 3),
                                    round(loss_fm.item(), 3), round(loss_mel.item(), 3),
                                    round(loss_kl.item(), 3)]
                    print("gstep={},lr={},disc={},gen={},fm={},mel={},kl={},tot={}".format(
                        global_step, lr, losses_numpy[0], losses_numpy[1], losses_numpy[2],
                        losses_numpy[3], losses_numpy[4], sum(losses_numpy)))

-                if global_step % 200 == 0:
-                    torch.save(net_g.state_dict(), "data/web_trained_models/xiafan_{}.pth".format(global_step))
-
                global_step += 1
            scheduler_g.step()
            scheduler_d.step()
        return net_g

    def infer(self, in_wav, dst_wav, model):
        tran = 0  # pitch shift (semitones)
        source, sr = librosa.load(in_wav, sr=32000, mono=True)
        source = torch.tensor(source).unsqueeze(0)
        sid = torch.LongTensor([0]).to(self.device).unsqueeze(0)
        soft, pitch = self.get_unit_pitch(source, sr, tran)
        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
        stn_tst = torch.FloatTensor(soft)
        with torch.no_grad():
            model.eval()
            x_tst = stn_tst.unsqueeze(0).to(self.device)
            start = time.time()
            x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
            audio = model.infer(x_tst, f0=f0, g=sid)[0, 0].data.float()
            use_time = time.time() - start
            print("vits use time:{}".format(use_time))
        # Write the result to file
        soundfile.write(dst_wav, audio.cpu().numpy(), sr, format='wav')

    # ###### Public entry point: train and then infer
    def process_train_and_infer(self, train_media, in_path, dst_path, dst_model_path=None, params={}):
        """
        :param train_media: audio used for training
        :param in_path: vocal audio to be converted
        :param dst_path: path of the converted output file
        :param dst_model_path: optional path for caching the trained model
        :return:
        """
        # Transcode train_media to 32 kHz mono
        tmp_32_wav = train_media + "_321.wav"
        cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(train_media, tmp_32_wav)
        os.system(cmd)
        if not os.path.exists(tmp_32_wav):
            return 1

        # Denoise
        tmp_wav = train_media + "_de321.wav"
        cmd = "{} {} {}".format(gs_denoise_exe, tmp_32_wav, tmp_wav)
        os.system(cmd)
        if not os.path.exists(tmp_wav):
            os.unlink(tmp_32_wav)
            return 2

        in_wav_tmp = in_path + "_321.wav"
        cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(in_path, in_wav_tmp)
        os.system(cmd)
        if not os.path.exists(in_wav_tmp):
            os.unlink(tmp_32_wav)
            os.unlink(tmp_wav)
            return 3

        global gs_train_config
        max_step = params.get('max_step', 200)
        gs_train_config['c_mel'] = params.get("c_mel", 45)
        gs_train_config['c_fm'] = params.get("c_fm", 1.0)
        gs_train_config['c_gen'] = params.get("c_gen", 1.0)
        print("params:{}".format(params))

        st = time.time()
        model = self.train(tmp_wav, max_step)
        print("train sp={}".format(time.time() - st))
        st = time.time()
        self.infer(in_wav_tmp, dst_path, model)
        print("infer sp={}".format(time.time() - st))
        if dst_model_path is not None:
            st = time.time()
            torch.save(model.state_dict(), dst_model_path)
            print("save model sp={}".format(time.time() - st))

        os.unlink(tmp_32_wav)
        os.unlink(tmp_wav)
        os.unlink(in_wav_tmp)
        return 0

    # Inference from a saved model
    def process_infer(self, model_path, in_path, dst_path):
        net_g = SynthesizerTrn(
            gs_data_config["filter_length"] // 2 + 1,
            gs_train_config["segment_size"] // gs_data_config["hop_length"],
            **gs_model_config, no_flow=False, use_v3=False).cuda()
        model_dict = torch.load(model_path, map_location='cpu')
        net_g.load_state_dict(model_dict)

        in_wav_tmp = in_path + "_321.wav"
        cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {}".format(in_path, in_wav_tmp)
        os.system(cmd)
        if not os.path.exists(in_wav_tmp):
            return 2
        self.infer(in_wav_tmp, dst_path, net_g)


if __name__ == '__main__':
    pp = "data/train_users/qiankun_v1/vocals/speaker0/qiankun.wav"
    in_p = "data/test/vocal_32.wav"
    dst_p = "data/test/vocal_32_out.wav"
    dst_m_p = "data/test/mm.pth"
    g_path = "data/online_models/models/base_model/sunyanzi_base_2000.pth"
    d_path = "data/online_models/models/base_model/sunyanzi_base_d_2000.pth"
    svsot = SoVitsSVCOnlineTrain(g_path, d_path)
    start_time = time.time()
    ret = svsot.process_train_and_infer(pp, in_p, dst_p, dst_m_p)
    print("process = {} ret={}".format(time.time() - start_time, ret))