diff --git a/AutoCoverTool/online/inference_one.py b/AutoCoverTool/online/inference_one.py
index 8113e10..a73d87d 100644
--- a/AutoCoverTool/online/inference_one.py
+++ b/AutoCoverTool/online/inference_one.py
@@ -1,688 +1,689 @@
"""
Single-song processing logic
song_id:
    ---src.mp3                                // source file, must be placed here in advance
    ---cache
        ---vocal.wav                          // produced by source separation
        ---acc.wav                            // produced by source separation
        ---vocal_32.wav                       // produced by source separation
        ---song_id_sp1.wav                    // produced by synthesis
        ---song_id_sp2.wav                    // produced by synthesis
        ---song_id_sp2_d.wav                  // produced by denoising
        ---song_id_sp2_dv.wav                 // produced by denoise + volume stretch [not produced when the dirty-frame ratio is too high]
        ---song_id_sp2_dve442.wav             // produced by manual adjustment
        ---song_id_sp2_dve442_replace.wav     // produced by frame replacement
        ---song_id_sp2_dve442_replace_mix.wav // produced by mixing vocal + accompaniment
    ---song_id
        --acc.mp3                             // 44k stereo 320k
        --vocal.mp3                           // 44k stereo 320k
        --src.mp3                             // 44k stereo 320k
        --song_id_sp2_dv.mp3                  // 44k mono 320k
    ---song_id_out                            // external deliverables
        --src.mp3                             // original audio
        --song_id_sp2_dv_replace_mix.mp3      // finished audio

Environment setup:
conda create -n auto_song_cover python=3.9
# Install the demucs environment [cd into ref.music_remover and run pip install -r requirements.txt]
# Install the so_vits_svc environment [cd into ref.so_vits_svc and run pip install -r requirements.txt]
pip install librosa
pip install scikit-maad
pip install praat-parselmouth
pip install matplotlib
pip install torchvision
pip install madmom
pip install torchstat
Environment variables:
export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin
export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame
"""

import os
import time
import shutil
import random
import logging
import librosa
import numpy as np  # used below; also re-exported by the star import, made explicit here

logging.basicConfig(filename='/tmp/inference.log', level=logging.INFO)

gs_err_code_success = 0
gs_err_code_no_src_mp3 = 1
gs_err_code_separate = 2
gs_err_code_trans_32 = 3
gs_err_code_encode_err = 4
gs_err_code_replace_err = 5
gs_err_code_replace_trans_err = 6
gs_err_code_mix_err = 7
gs_err_code_mix_transcode_err = 8
gs_err_code_no_src_dir = 9
gs_err_code_volume_err = 10
gs_err_code_trans2_442 = 11
gs_err_code_reverb = 12
gs_err_code_no_good_choice = 13
gs_err_code_preprocess_vocal = 14
gs_err_code_replace_except_err = 15

gs_denoise_exe = "/opt/soft/bin/denoise_exe"
gs_draw_volume_exe = "/opt/soft/bin/draw_volume"
gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"
gs_rever_path = "/opt/soft/bin/dereverbrate"

from ref.music_remover.separate_interface import SeparateInterface
from ref.so_vits_svc.inference_main import *
from ref.split_dirty_frame.script.process_one import ReplaceVocalFrame, construct_power_fragment


class SongCoverInference:
    def __init__(self):
        self.work_dir = None
        self.cache_dir = None
        self.cid = None
        self.src_mp3 = None
        self.vocal_path = None
        self.vocal_32_path = None
        self.acc_path = None
        self.speakers = [
            10414574138721494, 10414574140317353, 1688849864840588, 3634463651,
            5629499489839033, 5910973794723621, 6755399374234747, 8162774327817435,
            8162774329368194,
            1125899914308640,  # male voices from here on (inclusive)
            12384898975368914, 12947848931397021, 3096224748076687, 3096224751151928,
            5066549357604730, 5348024335101054, 6755399442719465, 7036874421386111
        ]
        self.speakers2gender = {
            10414574138721494: 2, 10414574140317353: 2, 1688849864840588: 2, 3634463651: 2,
            5629499489839033: 2, 5910973794723621: 2, 6755399374234747: 2, 8162774327817435: 2,
            8162774329368194: 2,
            1125899914308640: 1,  # 1 = male
            12384898975368914: 1, 12947848931397021: 1, 3096224748076687: 1, 3096224751151928: 1,
            5066549357604730: 1, 5348024335101054: 1, 6755399442719465: 1, 7036874421386111: 1
        }
        self.speakers_model_path = "data/train_users/{}/logs/32k/G_2000.pth"
        self.speakers_model_config = "data/train_users/{}/config/config.json"
        st = time.time()
        self.separate_inst = None
        logging.info("post process ... ReplaceVocalFrame init sp={}".format(time.time() - st))
        self.replace_vocal_frame_inst = None
        logging.info("SongCoverInference init sp={}".format(time.time() - st))
ReplaceVocalFrame init sp={}".format(time.time() - st)) self.replace_vocal_frame_inst = None logging.info("SongCoverInference init sp={}".format(time.time() - st)) def separate(self, cid, src_mp3, vocal_path, acc_path): """ 人声伴奏分离 :param cid: :param src_mp3: :param vocal_path: :param acc_path: :return: """ st = time.time() if self.separate_inst is None: self.separate_inst = SeparateInterface() if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path): return gs_err_code_separate if not os.path.exists(vocal_path) or not os.path.exists(acc_path): return gs_err_code_separate # 转码出一个32k单声道的数据 cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {} -loglevel fatal".format(vocal_path, self.vocal_32_path) os.system(cmd) if not os.path.exists(self.vocal_32_path): return gs_err_code_trans_32 print("separate:cid={}|sp={}".format(cid, time.time() - st)) return gs_err_code_success def get_start_ms(self, vocal_path): """ 给定原始音频,找一段连续10s的音频 :param vocal_path: :return: """ audio, sr = librosa.load(vocal_path, sr=16000) audio = librosa.util.normalize(audio) # 帧长100ms,帧移10ms,计算能量 power_arr = [] for i in range(0, len(audio) - 1600, 160): power_arr.append(np.sum(np.abs(audio[i:i + 160])) / 160) # 将能量小于等于10的部分做成段 power_arr = construct_power_fragment(power_arr) fragments = [] last_pos = 0 for idx, line in enumerate(power_arr): start = round(float(line[0]) * 0.01, 3) duration = round(float(line[1]) * 0.01, 3) fragments.append([last_pos, start - last_pos]) last_pos = start + duration if last_pos < len(audio) / sr: fragments.append([last_pos, len(audio) / sr - last_pos]) # 合并数据,两者间隔在50ms以内的合并起来 idx = 0 while idx < len(fragments) - 1: if fragments[idx + 1][0] - (fragments[idx][0] + fragments[idx][1]) < 0.05: fragments[idx][1] = fragments[idx + 1][0] + fragments[idx + 1][1] - fragments[idx][0] del fragments[idx + 1] idx -= 1 idx += 1 # out_file = vocal_path + "_power.csv" # with open(out_file, "w") as f: # f.write("Name\tStart\tDuration\tTime Format\tType\n") # for fragment in fragments: # start = round(float(fragment[0]), 3) # duration = round(float(fragment[1]), 3) # strr = "{}\t{}\t{}\t{}\n".format("11", start, duration, "decimal\tCue\t") # f.write(strr) # 筛选出开始的位置 # 1. 连续时长大于10s,当前段长度大于3s # 2. 

    def inference_speaker(self):
        """
        Run inference to synthesize candidate covers.
        Randomly sample speakers (15 in code), keep the candidate with the lowest
        dirty-frame rate, and require that rate < 0.3
        :return:
        """
        st = time.time()
        out_speakers = random.sample(self.speakers, 15)
        out_songs_dict = {}
        for speaker in out_speakers:
            model_path = self.speakers_model_path.format(speaker)
            config_path = self.speakers_model_config.format(speaker)
            song_path = os.path.join(self.cache_dir, "{}_{}.wav".format(self.cid, speaker))
            try:
                inf(model_path, config_path, self.vocal_32_path, song_path, "prod")
            except Exception as ex:
                logging.info("cid={}, inference_speaker err={}".format(self.cid, ex))
                continue
            if os.path.exists(song_path):
                if self.replace_vocal_frame_inst is None:
                    self.replace_vocal_frame_inst = ReplaceVocalFrame(
                        "data/models/split_dirty_frame_v5_3_epoch3_852.pth")
                rate = self.replace_vocal_frame_inst.get_rate(song_path)
                if rate < 0.3:
                    out_songs_dict[song_path] = rate

        # Keep the candidate with the lowest rate
        out_songs = []
        if len(out_songs_dict.keys()) > 0:
            st_sec = self.get_start_ms(self.vocal_path)
            song_msg = sorted(out_songs_dict.items(), key=lambda kv: kv[1])[0]
            out_songs = [song_msg[0]]
            logging.info("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2),
                                                                          round(st_sec, 3)))
            print("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2),
                                                                   round(st_sec, 3)))
        # logging.info("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st))
        print("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st))
        return out_songs

    def get_new_vocal_rate(self, songs):
        """
        Get the dirty-frame rate of each vocal
        :param songs:
        :return:
        """
        st = time.time()
        need_to_process_song = []
        for song in songs:
            if self.replace_vocal_frame_inst is None:
                self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth")
            rate = self.replace_vocal_frame_inst.get_rate(song)
            logging.info("{} {} replace_rate={}".format(self.cid, song, rate))
            if rate < 1.0:
                need_to_process_song.append(song)
        logging.info(
            "get_new_vocal_rate belen = {} len = {} finish sp = {}".format(len(songs), len(need_to_process_song),
                                                                           time.time() - st))
        return need_to_process_song

    def preprocess_vocal(self, songs, vocal_path):
        """
        1. Denoise
        2. Volume stretch
        :param songs:
        :param vocal_path: reference audio signal
        :return:
        """
        st = time.time()
        dv_out_list = []
        for song in songs:
            denoise_path = str(song).replace(".wav", "_d.wav")
            cmd = "{} {} {}".format(gs_denoise_exe, song, denoise_path)
            os.system(cmd)
            if not os.path.exists(denoise_path):
                print("{} {} ERROR denoise".format(self.cid, song))
                continue

            # Volume stretch
            volume_path = str(song).replace(".wav", "_dv.wav")
            cmd = "{} {} {} {}".format(gs_draw_volume_exe, denoise_path, vocal_path, volume_path)
            os.system(cmd)
            if not os.path.exists(volume_path):
                print("{} {} ERROR draw_volume".format(self.cid, volume_path))
                continue
            dv_out_list.append(volume_path)
        print(
            "preprocess_vocal belen = {} len = {} finish sp = {}".format(len(songs), len(dv_out_list),
                                                                         time.time() - st))
        return dv_out_list

    def output(self, dv_out_list):
        """
        Export data for external use
        :param dv_out_list:
        :return:
        """
        st = time.time()
        out_dir = os.path.join(self.work_dir, self.cid)
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)

        # Copy the source and transcode vocal/accompaniment
        dst_mp3_path = os.path.join(out_dir, "src.mp3")  # was "src_mp3", which dropped the extension
        dst_acc_path = os.path.join(out_dir, "acc.mp3")
        dst_vocal_path = os.path.join(out_dir, "vocal.mp3")
        shutil.copyfile(self.src_mp3, dst_mp3_path)
        cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.acc_path, dst_acc_path)
        os.system(cmd)
        if not os.path.exists(dst_acc_path):
            return gs_err_code_encode_err
        cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.vocal_path, dst_vocal_path)
        os.system(cmd)
        if not os.path.exists(dst_vocal_path):
            return gs_err_code_encode_err

        # Put everything into out_dir for manual annotation
        for dv_wav in dv_out_list:
            dv_wav_name = str(dv_wav).split("/")[-1].replace(".wav", "_441.mp3")
            dst_dv_path = os.path.join(out_dir, dv_wav_name)
            cmd = "ffmpeg -i {} -ar 44100 -ac 1 -ab 320k -y {} -loglevel fatal".format(dv_wav, dst_dv_path)
            os.system(cmd)
            if not os.path.exists(dst_dv_path):
                print("{} encode err!".format(cmd))
                continue
        logging.info(
            "preprocess_vocal output sp = {}".format(time.time() - st))
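preprocess_vocal and output repeat a single pattern: format a shell command, os.system it, then test whether the output file appeared. A hedged sketch of the same contract via subprocess.run, which also surfaces the tool's exit code; _run_tool is a hypothetical helper, not part of this module:

def _run_tool(cmd, out_path):
    """Run an external tool; succeed only if it exits cleanly AND the expected
    output file exists (same checks as the os.system calls above)."""
    import shlex
    import subprocess
    ret = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
    if ret.returncode != 0:
        print("cmd failed rc={} stderr={}".format(ret.returncode, ret.stderr[-200:]))
        return False
    return os.path.exists(out_path)

# usage mirroring preprocess_vocal (paths hypothetical):
#   ok = _run_tool("{} {} {}".format(gs_denoise_exe, song, denoise_path), denoise_path)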
{} {} {}".format(self.src_mp3, self.vocal_path, self.acc_path)) err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path) if err != gs_err_code_success: return err, None, None logging.info("start inference_speaker ...") out_songs = self.inference_speaker() dv_out_list = self.preprocess_vocal(out_songs, self.vocal_path) if len(dv_out_list) == 0: return gs_err_code_no_good_choice, None, None mix_mp3_path = None gender = -1 if enable_output: self.output(dv_out_list) else: # 默认全部处理一遍 for dv_out_path in dv_out_list: src_path = dv_out_path.replace("_dv.wav", ".wav") err, mix_mp3_path = self.after_process(self.cid, self.work_dir, src_path, dv_out_path, self.vocal_path, self.acc_path, True, False) if err != gs_err_code_success: logging.info("after_process err {}".format(err)) # 取出性别属性 if err == gs_err_code_success and mix_mp3_path is not None: gender = self.speakers2gender[int(str(os.path.basename(mix_mp3_path)).split("_")[1])] logging.info("finish:cid={},work_dir={}----------------------->>>>>>>>".format(cid, work_dir)) return gs_err_code_success, mix_mp3_path, gender def reverb_by_vocal(self, file): st = time.time() file_442 = file.replace(".wav", "_442.wav") if not os.path.exists(file_442): cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(file, file_442) os.system(cmd) if not os.path.exists(file_442): return gs_err_code_trans2_442, None file_dst = file.replace(".wav", "_442_dr.wav") cmd = "{} {} {} {}".format(gs_rever_path, self.vocal_path, file_442, file_dst) os.system(cmd) if not os.path.exists(file_dst): return gs_err_code_reverb, None print("cid = {}, reverb_by_vocal sp={}".format(self.cid, time.time() - st)) return gs_err_code_success, file_dst def after_process(self, cid, work_dir, in_file, effect_file, vocal_file, acc_file, need_draw=True, need_reverb=True): """ 后处理逻辑 将处理好的音频进行替换,然后和伴奏进行混合,最后进行编码 :return: """ if need_reverb: # 抓取混响 err, effect_file = self.reverb_by_vocal(in_file) if err != gs_err_code_success: return err, None if need_draw: # 增加一个拉伸的步骤 volume_path = str(effect_file).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, vocal_file, volume_path) print(cmd) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None effect_file = volume_path st = time.time() self.cid = cid self.work_dir = work_dir self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.work_dir): return gs_err_code_no_src_dir self.replace_vocal_frame_inst.process(in_file, effect_file, vocal_file) dst_path = effect_file + "_replace.wav" if not os.path.exists(dst_path): return gs_err_code_replace_err, None print("replace_vocal_frame_inst sp = {}".format(time.time() - st)) # 转码 dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav") cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442) os.system(cmd) if not os.path.exists(dst_path_442): return gs_err_code_replace_trans_err, None # 合并转码后再做一次拉伸,保证响度 volume_path = str(dst_path_442).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, vocal_file, volume_path) print(cmd) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None dst_path_442 = volume_path # 混合 mix_path = dst_path_442.replace("_replace442.wav", "_replace442_mix.wav") cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, acc_file, mix_path) 
print("{}".format(cmd)) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix_err, None # 编码为mp3 output_dir = os.path.join(self.work_dir, self.cid + "_out") if not os.path.exists(output_dir): os.makedirs(output_dir) name = str(mix_path).replace("_replace442_mix.wav", "_replace442_mix.mp3").split("/")[-1] mix_path_mp3 = os.path.join(output_dir, name) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_mix_transcode_err, None # 拷贝src到output_dir # shutil.copyfile(self.src_mp3, os.path.join(output_dir, "src.mp3")) # logging.info("after_process sp = {}".format(time.time() - st)) return gs_err_code_success, mix_path_mp3 ####################################新对外接口############################################################ def prepare_env(self, cid, work_dir, create_dir=False): self.cid = cid self.work_dir = work_dir # 所有不对外交付的,全部放到这里 self.cache_dir = os.path.join(work_dir, "cache") if create_dir: if os.path.exists(self.cache_dir): shutil.rmtree(self.cache_dir) os.makedirs(self.cache_dir) self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.src_mp3): return gs_err_code_no_src_mp3 self.vocal_path = os.path.join(self.cache_dir, "vocal.wav") self.vocal_32_path = os.path.join(self.cache_dir, "vocal_32.wav") self.acc_path = os.path.join(self.cache_dir, "acc.wav") return gs_err_code_success def generate_svc_file(self, cid, work_dir): """ :param cid: :param work_dir: :return:err_code, 生成出的svc的文件名称 """ err = self.prepare_env(cid, work_dir, create_dir=True) if err != gs_err_code_success: return err, None # 音源分离 if not os.path.exists(self.vocal_32_path): st = time.time() err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path) logging.info("cid={},separate,sp={}".format(self.cid, time.time() - st)) if err != gs_err_code_success: return err, None # 生成svc,只保留一个最佳的 st = time.time() out_songs = self.inference_speaker() if len(out_songs) == 0: return gs_err_code_no_good_choice, None logging.info("cid={},inference_speaker,{},sp={}".format(self.cid, out_songs[0], time.time() - st)) return gs_err_code_success, out_songs[0] def effect(self, cid, work_dir, svc_file): st = time.time() err = self.prepare_env(cid, work_dir) if err != gs_err_code_success: return err, None logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 预处理人声 dv_out_list = self.preprocess_vocal([svc_file], self.vocal_path) if len(dv_out_list) == 0: return gs_err_code_preprocess_vocal, None svc_file = dv_out_list[0] # 做音效 st = time.time() err, effect_file = self.reverb_by_vocal(svc_file) if err != gs_err_code_success: return err, None logging.info("cid={},reverb_by_vocal,{},sp={}".format(self.cid, svc_file, time.time() - st)) return err, effect_file def mix(self, cid, work_dir, svc_file, effect_file): """ 做音效以及合并 :param cid: :param work_dir: :param svc_file: :param effect_file: :return: err_code, 完成的mp3文件 """ st = time.time() err = self.prepare_env(cid, work_dir) if err != gs_err_code_success: return err, None logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 拉伸 st = time.time() volume_path = str(effect_file).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, self.vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None effect_file = volume_path 
logging.info("cid={},draw_volume,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 替换 st = time.time() try: if self.replace_vocal_frame_inst is None: self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth") self.replace_vocal_frame_inst.process(svc_file, effect_file, self.vocal_path) except Exception as ex: logging.info("{},replace_vocal_frame_inst, {}", self.cid, ex) return gs_err_code_replace_except_err, None dst_path = effect_file + "_replace.wav" if not os.path.exists(dst_path): return gs_err_code_replace_err, None logging.info("cid={},replace_vocal_frame_inst,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 转码 st = time.time() dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav") cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442) os.system(cmd) if not os.path.exists(dst_path_442): return gs_err_code_replace_trans_err, None logging.info("cid={},transcode,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 合并转码后再做一次拉伸,保证响度 st = time.time() volume_path = str(dst_path_442).replace("_replace442.wav", "_replace442_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, self.vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None dst_path_442 = volume_path logging.info("cid={},draw_volume2,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 混合 st = time.time() mix_path = dst_path_442.replace("_replace442_dv.wav", "_replace442_dv_mix.wav") cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, self.acc_path, mix_path) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix_err, None logging.info("cid={},mixer,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 编码为mp3 st = time.time() output_dir = os.path.join(self.work_dir, self.cid + "_out") if not os.path.exists(output_dir): os.makedirs(output_dir) name = str(mix_path).replace("_replace442_dv_mix.wav", "_replace442_dv_mix.mp3").split("/")[-1] mix_path_mp3 = os.path.join(output_dir, name) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) print(cmd) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_mix_transcode_err, None logging.info("cid={},encode,{},sp={}".format(self.cid, svc_file, time.time() - st)) return gs_err_code_success, mix_path_mp3 def get_gender(self, svc_file): return self.speakers2gender[int(os.path.basename(svc_file.replace(".wav", "")).split("_")[1])] def process_one_logic(self, cid, work_dir): """ 搞成两部分: 1. 分离数据+5次推理,获取最佳结果,并保存 2. 

    def process_one_logic(self, cid, work_dir):
        """
        Split into two stages:
        1. separation + inference over the sampled speakers, keep and save the best result
        2. apply effects to the best result and mix
        :return:
        """
        err, svc_file = self.generate_svc_file(cid, work_dir)
        gender = -1
        if err != gs_err_code_success:
            return err, svc_file, gender

        gender = self.get_gender(svc_file)
        err, effect_file = self.effect(cid, work_dir, svc_file)
        if err != gs_err_code_success:
            return err, svc_file, gender
        err, mix_mp3_path = self.mix(cid, work_dir, svc_file, effect_file)
        return err, mix_mp3_path, gender


def test():
    arr = [
        # "611752105020343687",
        # "611752105023532439",
        # "611752105030419688",
        # "611752105030485748",
-        "611752105030485685"
+        # "611752105030485685",
+        "dzq",
    ]
    base_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test"
    s_inst = SongCoverInference()
    for cid in arr:
        st = time.time()
        # err, mix_mp3, gender = s_inst.process_one(cid, os.path.join(base_dir, cid), False)
        err, mix_mp3, gender = s_inst.process_one_logic(cid, os.path.join(base_dir, cid))
        print(mix_mp3, gender)
        print("cid={} RealFinish err={} sp={}".format(cid, err, time.time() - st))


if __name__ == '__main__':
    test()
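A hedged sketch of driving the new public interface end to end (generate_svc_file -> effect -> mix), mirroring process_one_logic; the cid, work_dir, and helper name _demo_public_api are hypothetical:

def _demo_public_api():
    inst = SongCoverInference()
    cid = "dzq"
    work_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test/dzq"
    err, svc_file = inst.generate_svc_file(cid, work_dir)  # separation + SVC inference
    if err != gs_err_code_success:
        return err
    gender = inst.get_gender(svc_file)                     # 1 = male, 2 = female
    err, effect_file = inst.effect(cid, work_dir, svc_file)  # denoise/stretch/dereverb
    if err != gs_err_code_success:
        return err
    err, mix_mp3 = inst.mix(cid, work_dir, svc_file, effect_file)  # replace + mix + encode
    print(gender, mix_mp3)
    return err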
diff --git a/AutoCoverTool/script/shuffle_music.py b/AutoCoverTool/script/shuffle_music.py
index 81b3e60..b3496d2 100644
--- a/AutoCoverTool/script/shuffle_music.py
+++ b/AutoCoverTool/script/shuffle_music.py
@@ -1,263 +1,429 @@
"""
Load a vocal track and shift its spectrum upward
"""
import librosa
import soundfile
import numpy as np
from copy import deepcopy


def local_maxium(x):
    """
    Find the local maxima of a sequence
    :param x:
    :return:
    """
    d = np.diff(x)
    l_d = len(d)
    maxium = []
    loc = []
    for i in range(l_d - 1):
        if d[i] > 0 and d[i + 1] <= 0:
            maxium.append(x[i + 1])
            loc.append(i + 1)
    return maxium, loc


def Formant_Cepst(u, cepstL):
    """
    Source: https://github.com/taw19960426/-Speech-signal-processing-experiment-tutorial-_python/blob/master/%E5%85%B1%E6%8C%AF%E5%B3%B0%E4%BC%B0%E8%AE%A1%E5%87%BD%E6%95%B0.py
    Cepstrum-based formant estimation
    :param u: input signal
    :param cepstL: width of the lifter window on the quefrency axis
    :return: val formant amplitudes
    :return: loc formant positions
    :return: spec spectral envelope
    """
    wlen2 = len(u) // 2
    u_fft = np.fft.fft(u)  # eq. (2-1)
    U = np.log(np.abs(u_fft[:wlen2]))
    Cepst = np.fft.ifft(U)  # eq. (2-2)
    cepst = np.zeros(wlen2, dtype=complex)  # np.complex is deprecated; use the builtin
    cepst[:cepstL] = Cepst[:cepstL]  # eq. (2-3)
    cepst[-cepstL + 1:] = Cepst[-cepstL + 1:]  # mirror of the second term
    spec = np.real(np.fft.fft(cepst))
    val, loc = local_maxium(spec)  # search for maxima on the envelope
    return val, loc, spec


def get_ref_stft():
    sr = 44100
    audio, sr = librosa.load(
        "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_ref.wav",
        sr=sr, mono=True)
    stft = librosa.stft(audio, n_fft=2048)
    stft = stft.transpose()
    print(stft.shape)
    data = np.mean(np.abs(stft), axis=0)
    data = data / np.max(data)
    return data


def test(in_vocal):
    import matplotlib.pyplot as plt
    sr = 44100
    audio, sr = librosa.load(in_vocal, sr=sr, mono=True)
    stft = librosa.stft(audio, n_fft=2048)
    stft = stft.transpose()
    new_stft = np.zeros_like(stft)
    w1 = get_ref_stft()
    data = np.mean(np.abs(stft), axis=0)
    data = data / np.max(data)
    w = w1 / data
    for ii in range(0, len(stft)):
        # Variant 1: shift everything up by 3 bins
        # for i in range(0, 3):
        #     new_stft[ii][i] = stft[ii][i]
        # for i in range(0, len(stft[ii]) - 3):
        #     dst_i = i + 3
        #     new_stft[ii][dst_i] = stft[ii][i]

        # Variant 2: stretch everything up by a factor of 1.12 [two semitones]
        # for i in range(0, 1):
        #     new_stft[ii][i] = stft[ii][i]
        # for i in range(1, len(stft[ii])):
        #     dst_i = int(i * 1.12 + 0.5)
        #     if dst_i >= len(stft[ii]):
        #         break
        #     new_stft[ii][dst_i] += stft[ii][i]

        # Variant 3: keep the first-formant region fixed, shift the rest
        # power = np.abs(stft[ii])
        # power = power / (np.max(power))
        #
        # x = np.array(list(range(0, len(stft[ii]))))
        # y = power
        #
        # new_x = []
        # new_y = []
        # for i in range(1, len(x) - 1, 1):
        #     if y[i - 1] < y[i] > y[i + 1] and y[i] > 0.01:
        #         new_x.append(x[i])
        #         new_y.append(y[i])
        #
        # # merge peaks within ~100 Hz of each other
        # x = new_x
        # y = new_y
        # new_x = []
        # new_y = []
        # for i in range(1, len(x) - 1, 1):
        #     if y[i - 1] < y[i] > y[i + 1]:
        #         if x[i] - x[i - 1] > 5:
        #             new_x.append(x[i - 1])
        #             new_y.append(y[i - 1])
        #         new_x.append(x[i])
        #         new_y.append(y[i])
        #         if x[i + 1] - x[i] > 5:
        #             new_x.append(x[i + 1])
        #             new_y.append(y[i + 1])
        #
        # if len(new_x) <= 1:
        #     new_stft[ii] = deepcopy(stft[ii])
        #     continue
        #
        # # shift upward starting from the first formant
        # st_freq_idx = new_x[1]
        # for i in range(st_freq_idx, len(stft[ii])):
        #     dst_i = int(i * 1.12 + 0.5)
        #     if dst_i >= len(stft[ii]):
        #         continue
        #     new_stft[ii][dst_i] = stft[ii][i]
        # new_stft[ii][0] = stft[ii][0]
        # for i in range(0, st_freq_idx):
        #     new_stft[ii][i] = stft[ii][i]
-        new_stft[ii] = stft[ii] * w
+        # new_stft[ii] = stft[ii] * w
+        pass
    new_stft = new_stft.transpose()
    istft = librosa.istft(new_stft)
    soundfile.write(str(in_vocal).replace(".wav", "_out.wav"), istft, 44100, format="wav")


def test_v5(vocal, vocal_ref, vocal_ref2):
    sr = 44100
    audio, sr = librosa.load(vocal, sr=sr, mono=True)
    stft = librosa.stft(audio, n_fft=2048)
    stft = stft.transpose()
    new_stft = np.zeros_like(stft)

    audio_ref, sr = librosa.load(vocal_ref, sr=sr, mono=True)
    stft_ref = librosa.stft(audio_ref, n_fft=2048)
    stft_ref = stft_ref.transpose()

    audio_ref2, sr = librosa.load(vocal_ref2, sr=sr, mono=True)
    stft_ref2 = librosa.stft(audio_ref2, n_fft=2048)
    stft_ref2 = stft_ref2.transpose()

    w1 = np.ones(len(stft[0]))
    for i in range(0, 800):
        w1[i] = i / 800
    w2 = 1.0 - w1
    for i in range(0, min(len(stft), len(stft_ref2), len(stft_ref))):
        # new_stft[i] = stft_ref2[i] * w2 + stft[i] * w1
        w = np.abs(stft_ref2[i]) / np.abs(stft[i])
        new_stft[i] = w * stft[i]
    new_stft = new_stft.transpose()
    istft = librosa.istft(new_stft)
    soundfile.write(str(vocal).replace(".wav", "_out5.wav"), istft, 44100, format="wav")


def ttt(path):
    from scipy.signal import lfilter
    import matplotlib.pyplot as plt

    # path = "C4_3_y.wav"
    # data, fs = soundBase('C4_3_y.wav').audioread()
    data, fs = librosa.load(path, sr=44100, mono=True)  # sr=None keeps the original rate, mono=False keeps the original channels

    # Preprocessing: pre-emphasis
    u = lfilter([1, -0.99], [1], data)

    cepstL = 7
    wlen = len(u)
    wlen2 = wlen // 2
    print("frame length = {}".format(wlen))
    print("frame shift = {}".format(wlen2))
    # wlen = 256
    # wlen2 = 256 // 2
    # Preprocessing: windowing
    u2 = np.multiply(u, np.hamming(wlen))
    # Preprocessing: FFT, take the log magnitude of the first half
    U_abs = np.log(np.abs(np.fft.fft(u2))[:wlen2])  # section 4.3.1
    freq = [i * fs / wlen for i in range(wlen2)]
    # print(freq)
    # val: formant amplitudes, loc: formant positions, spec: envelope
    val, loc, spec = Formant_Cepst(u, cepstL)
    plt.subplot(2, 1, 1)
    plt.plot(freq, U_abs, 'k')
    plt.xlabel('Frequency/Hz')
    plt.ylabel('Amplitude')
    plt.title('Spectrum of a male /a/')
    plt.subplot(2, 1, 2)
    plt.plot(freq, spec, 'k')
    plt.xlabel('Frequency/Hz')
    plt.ylabel('Amplitude')
    plt.title('Cepstrum-based formant estimation')
    for i in range(len(loc)):
        plt.subplot(2, 1, 2)
        plt.plot([freq[loc[i]], freq[loc[i]]], [np.min(spec), spec[loc[i]]], '-.k')
        plt.text(freq[loc[i]], spec[loc[i]], 'Freq={}'.format(int(freq[loc[i]])))
    # plt.savefig('images/formant_estimation.png')
    plt.show()
    plt.close()
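Formant_Cepst (used by ttt above) can be sanity-checked on a synthetic spectrum. A sketch, assuming a made-up two-bump envelope and a wider lifter than ttt's cepstL=7 so both bumps stay resolved; _demo_formant_cepst is hypothetical:

def _demo_formant_cepst():
    # Build a signal whose magnitude spectrum equals a smooth two-bump envelope,
    # then check that the liftered envelope peaks near the bump centers.
    fs, n = 8000, 1024
    freqs = np.fft.rfftfreq(n, 1.0 / fs)
    env = 1e-3 + np.exp(-((freqs - 700) / 150) ** 2) + 0.7 * np.exp(-((freqs - 1200) / 150) ** 2)
    u = np.fft.irfft(env, n)  # time signal with |FFT| == env
    val, loc, spec = Formant_Cepst(u, cepstL=30)
    # The largest maxima should fall near 700 and 1200 (bin width fs/n = 7.8 Hz).
    print([round(l * fs / n) for l in loc])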

def main(path):
    import numpy as np
    import pyworld as pw
    from scipy.signal import freqz
    import librosa
    import math
    """
    Approach:
    Shift the pitch first, then slightly adjust the formants before re-synthesis
    """
    base_rate = 1.05946  # one semitone: 2 ** (1 / 12)
    pitch = 0
    fs = 44100
    x, sr = librosa.load(path, sr=fs, mono=True)
    x = x.reshape(-1).astype(np.float64)  # np.float is deprecated; pyworld expects float64
    f0, t = pw.dio(x, fs)
    f0 = pw.stonemask(x, f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    sp2 = np.zeros_like(sp)
    cur_rate = 1
    for i in range(sp.shape[1]):
        sp2[:, i] = sp[:, min(int(i * 1 / cur_rate), sp.shape[1] - 1)]
    ap = pw.d4c(x, f0, t, fs)
    rate = math.pow(base_rate, pitch)  # rate = 2 ** (pitch / 12)
    out = pw.synthesize(f0 * rate, sp2, ap, fs).reshape(-1, 1)
    soundfile.write(path.replace(".wav", "_out2.wav"), out, fs)


+def test_v4(in_file, ref_file):
+    freqs = get_pitch_freq(in_file)
+    # Resample to one frame per millisecond
+    new_freqs = np.zeros(len(freqs) * 5)
+    for i in range(0, len(new_freqs)):
+        new_freqs[i] = freqs[int(i / 5)]
+
+    freqs_ref = get_pitch_freq(ref_file)
+    # Resample to one frame per millisecond
+    new_freqs_ref = np.zeros(len(freqs_ref) * 5)
+    for i in range(0, len(new_freqs_ref)):
+        new_freqs_ref[i] = freqs_ref[int(i / 5)]
+
+    sr = 44100
+    audio, sr = librosa.load(in_file, sr=sr, mono=True)
+    stft = librosa.stft(audio, n_fft=2048)
+    stft = stft.transpose()
+    new_stft = np.zeros_like(stft)
+
+    sr = 44100
+    audio_ref, sr = librosa.load(ref_file, sr=sr, mono=True)
+    stft_ref = librosa.stft(audio_ref, n_fft=2048)
+    stft_ref = stft_ref.transpose()
+    # Locate the fundamental per frame from the pitch tracks; print-and-see for now
+    for i in range(min(len(stft), len(stft_ref))):
+        cur_tm = int(i * 11.61)  # hop of 512 samples at 44.1 kHz is about 11.61 ms
+        cur_pitch = max(new_freqs[cur_tm], new_freqs_ref[cur_tm])
+
+        # # Scheme 1
+        new_stft[i] = stft_ref[i]
+        if cur_pitch < 10:
+            continue
+        # cur_pitch = int(cur_pitch / (44100 / 2048) * 1.5 + 0.5)
+        # # keep the overall energy roughly unchanged
+        # rate = np.sum(np.abs(new_stft[i][:cur_pitch])) / np.sum(np.abs(stft[i][:cur_pitch]))
+        # new_stft[i][:cur_pitch] = stft[i][:cur_pitch] * rate
+
+        # Scheme 2
+        vocal_pitch = int(new_freqs[cur_tm] / (44100 / 2048) * 1.5 + 0.5)
+        ref_vocal_pitch = int(new_freqs_ref[cur_tm] / (44100 / 2048) * 1.5 + 0.5)
+        base_vocal_pitch = vocal_pitch
+        base_ref_vocal_pitch = ref_vocal_pitch
+        # Interleave: 1st formant from vocal, 2nd from ref, 3rd from vocal,
+        # 4th from ref, 5th from vocal, everything after from ref
+        # 1st formant
+        new_stft[i][vocal_pitch - 5:vocal_pitch + 5] = stft[i][vocal_pitch - 5:vocal_pitch + 5]
+        new_stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5] = stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5]
+
+        # 2nd formant
+        vocal_pitch = base_vocal_pitch * 2
+        ref_vocal_pitch = base_ref_vocal_pitch * 2
+        new_stft[i][vocal_pitch - 5:vocal_pitch + 5] = stft[i][vocal_pitch - 5:vocal_pitch + 5]
+        new_stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5] = stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5]
+
+        # 3rd formant
+        vocal_pitch = base_vocal_pitch * 3
+        ref_vocal_pitch = base_ref_vocal_pitch * 3
+        new_stft[i][vocal_pitch - 5:vocal_pitch + 5] = stft[i][vocal_pitch - 5:vocal_pitch + 5]
+        new_stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5] = stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5]
+
+        # Scheme 3
+
+        # 5th formant
+        # vocal_pitch = int(vocal_pitch / 3 * 5)
+        # ref_vocal_pitch = int(vocal_pitch / 3 * 5)
+        # new_stft[i][vocal_pitch - 5:] = stft[i][vocal_pitch - 5:]
+        # new_stft[i][ref_vocal_pitch - 5:] = stft[i][ref_vocal_pitch - 5:]
+
+    new_stft = new_stft.transpose()
+    istft = librosa.istft(new_stft)
+    soundfile.write(str(in_file).replace(".wav", "_out7.wav"), istft, 44100, format="wav")
+
+
+def get_pitch_freq(in_file):
+    # 1. transcode to 16k mono
+    # 2. extract pitch
+    import os
+
+    gs_ffmpeg = "/usr/local/bin/ffmpeg"
+    in_16k_wav = in_file + "_16k.wav"
+    cmd = "{} -i {} -ar 16000 -ac 1 -y {}".format(gs_ffmpeg, in_file, in_16k_wav)
+    os.system(cmd)
+    if not os.path.exists(in_16k_wav):
+        return np.array([])
+
+    gs_pitch_exe = "/Users/yangjianli/linux/opt/soft/bin/dpitch"
+    out_path = in_file + ".cache"
+    cmd = "{} {} {}".format(gs_pitch_exe, in_16k_wav, out_path)
+    print("exec:{}\n".format(cmd))
+    os.system(cmd)
+    if not os.path.exists(out_path):
+        return np.array([])
+
+    midi_arr = []
+    with open(out_path, "r") as f:
+        while True:
+            line = f.readline()
+            line = line.strip()
+            if not line:
+                break
+            freq = float(line)
+            midi_arr.append(freq)  # one frame per 5 ms
+    return np.array(midi_arr)
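get_pitch_freq shells out to a local dpitch binary that emits one f0 value per 5 ms frame. Where that binary is unavailable, librosa's built-in pYIN tracker (librosa >= 0.8) gives a comparable track; a sketch with untuned fmin/fmax, _get_pitch_freq_pyin being a hypothetical stand-in:

def _get_pitch_freq_pyin(in_file, sr=16000, hop_ms=5):
    # Rough stand-in for the dpitch binary above: one f0 per 5 ms frame,
    # 0.0 where pYIN judges the frame unvoiced.
    audio, sr = librosa.load(in_file, sr=sr, mono=True)
    hop = int(sr * hop_ms / 1000)  # 80 samples at 16 kHz
    f0, voiced, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
                                 fmax=librosa.note_to_hz('C6'), sr=sr, hop_length=hop)
    return np.where(voiced, f0, 0.0)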
+# NOTE: shadows the three-argument test_v5 defined above
+def test_v5(in_file):
+    sr = 44100
+    audio, sr = librosa.load(in_file, sr=sr, mono=True)
+    stft = librosa.stft(audio, n_fft=2048)
+    stft = stft.transpose()
+
+    w = np.ones(1025) * 0.1
+    for mid in [23, 23 * 2, 23 * 3, 23 * 4]:
+        for i in range(0, 5):
+            rate = i / 5
+            w[mid + 5 - i] = rate * (10 - 0.1) + 0.1
+            w[mid - 5 + i] = rate * (10 - 0.1) + 0.1
+    stft = stft * w
+    stft = stft.transpose()
+    istft = librosa.istft(stft)
+    soundfile.write(str(in_file).replace(".wav", "_out8.wav"), istft, 44100, format="wav")
+
+
+def test_v6(in_file, ref_file):
+    sr = 44100
+    audio, sr = librosa.load(in_file, sr=sr, mono=True)
+    stft = librosa.stft(audio, n_fft=2048)
+    stft = stft.transpose()
+
+    sr = 44100
+    audio_ref, sr = librosa.load(ref_file, sr=sr, mono=True)
+    stft_ref = librosa.stft(audio_ref, n_fft=2048)
+    stft_ref = stft_ref.transpose()
+
+    # Map from A onto B
+    new_stft = np.zeros_like(stft)
+    step = 85
+    new_stft[:step] = stft_ref[:step]
+    a1 = np.mean(np.abs(stft), axis=0)
+    a2 = np.mean(np.abs(stft_ref), axis=0)
+    w = a2 / a1
+    print(np.max(w), np.min(w), np.mean(w))
+    for i in range(step, min(len(stft_ref), len(stft))):
+        a1 = np.mean(np.abs(stft[i - step:i]), axis=0)  # original voice
+        a2 = np.mean(np.abs(stft_ref[i - step:i]), axis=0)  # pitch-shifted voice
+        # w = a2 - a1
+        w = a2 / a1
+        # w[w > 2] = 2
+        # w[w < 0.5] = 0.5
+        # fine-tune: scale the magnitude spectrum, keep the phase
+        new_stft[i] = (np.abs(stft[i]) * w) * np.exp(1j * np.angle(stft[i]))
+
+    new_stft = new_stft.transpose()
+    istft = librosa.istft(new_stft)
+    soundfile.write(str(in_file).replace(".wav", "_out8.wav"), istft, 44100, format="wav")
+
+
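The core move in test_v6 is to rescale each frame's magnitudes while keeping its phases: new = |X| * w * exp(j * angle(X)). A short check on arbitrary toy values; _demo_phase_preserving_scale is hypothetical:

def _demo_phase_preserving_scale():
    x = np.array([3 + 4j, -1 + 1j])  # toy STFT bins
    w = np.array([2.0, 0.5])         # arbitrary per-bin magnitude weights
    y = (np.abs(x) * w) * np.exp(1j * np.angle(x))
    print(np.abs(y))                              # [10.  0.70710678] -> magnitudes scaled
    print(np.allclose(np.angle(y), np.angle(x)))  # True -> phases preserved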
"/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/out1/vocal_p2.wav" + test_v6(vv, vref)