diff --git a/AutoCoverTool/online/inference_one.py b/AutoCoverTool/online/inference_one.py index a59619e..d1cde10 100644 --- a/AutoCoverTool/online/inference_one.py +++ b/AutoCoverTool/online/inference_one.py @@ -1,712 +1,713 @@ """ 单个处理的逻辑 song_id: ---src.mp3 // 源数据,需要提前放进去 ---cache ---vocal.wav // 分离之后产生 ---acc.wav // 分离之后产生 ---vocal_32.wav // 分离之后产生 ---song_id_sp1.wav // 合成之后产生 ---song_id_sp2.wav // 合成之后产生 ---song_id_sp2_d.wav // 降噪之后生成 ---song_id_sp2_dv.wav // 降噪+拉伸之后产生 [占比太高的不产生] ---song_id_sp2_dve442.wav // 手动调整之后产生 ---song_id_sp2_dve442_replace.wav // 替换之后产生 ---song_id_sp2_dve442_replace_mix.wav // 人声+伴奏混合之后产生 ---song_id --acc.mp3 // 44k双声道320k --vocal.mp3 // 44k双声道320k --src.mp3 // 44k双声道320k --song_id_sp2_dv.mp3 // 44k单声道320k ---song_id_out // 对外输出 --src.mp3 // 原始音频 --song_id_sp2_dv_replace_mix.mp3 // 制作完成的音频 环境安装: conda create -n auto_song_cover python=3.9 # 安装demucs环境[进入到ref.music_remover 执行pip install -r requirements.txt] # 安装so_vits_svc环境[进入到ref.so_vits_svc 执行pip install -r requirements.txt] pip install librosa pip install scikit-maad pip install praat-parselmouth pip install matplotlib pip install torchvision pip install madmom pip install torchstat 环境设置: export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame """ import os import time import shutil import random import logging import librosa logging.basicConfig(filename='/tmp/inference.log', level=logging.INFO) gs_err_code_success = 0 gs_err_code_no_src_mp3 = 1 gs_err_code_separate = 2 gs_err_code_trans_32 = 3 gs_err_code_encode_err = 4 gs_err_code_replace_err = 5 gs_err_code_replace_trans_err = 6 gs_err_code_mix_err = 7 gs_err_code_mix_transcode_err = 8 gs_err_code_no_src_dir = 9 gs_err_code_volume_err = 10 gs_err_code_trans2_442 = 11 gs_err_code_reverb = 12 gs_err_code_no_good_choice = 13 gs_err_code_preprocess_vocal = 14 gs_err_code_replace_except_err = 15 gs_denoise_exe = "/opt/soft/bin/denoise_exe" gs_draw_volume_exe = "/opt/soft/bin/draw_volume" gs_simple_mixer_path = "/opt/soft/bin/simple_mixer" gs_rever_path = "/opt/soft/bin/dereverbrate" from ref.music_remover.separate_interface import SeparateInterface from ref.so_vits_svc.inference_main import * from ref.split_dirty_frame.script.process_one import ReplaceVocalFrame, construct_power_fragment class SongCoverInference: def __init__(self): self.work_dir = None self.cache_dir = None self.cid = None self.src_mp3 = None self.vocal_path = None self.vocal_32_path = None self.acc_path = None self.speakers = [ 10414574138721494, 10414574140317353, 1688849864840588, 3634463651, 5629499489839033, 5910973794723621, 6755399374234747, 8162774327817435, 8162774329368194, 1125899914308640, # 以下为男声,包括这个 12384898975368914, 12947848931397021, 3096224748076687, 3096224751151928, 5066549357604730, 5348024335101054, 6755399442719465, 7036874421386111 ] self.speakers2gender = { 10414574138721494: 2, 10414574140317353: 2, 1688849864840588: 2, 3634463651: 2, 5629499489839033: 2, 5910973794723621: 2, 6755399374234747: 2, 8162774327817435: 2, 8162774329368194: 2, 1125899914308640: 1, # 1是男 12384898975368914: 1, 12947848931397021: 1, 3096224748076687: 1, 3096224751151928: 1, 5066549357604730: 1, 5348024335101054: 1, 6755399442719465: 1, 7036874421386111: 1 } self.speakers_model_path = "data/train_users/{}/logs/32k/G_2000.pth" self.speakers_model_config = "data/train_users/{}/config/config.json" st = time.time() self.separate_inst = None logging.info("post process ... 
ReplaceVocalFrame init sp={}".format(time.time() - st)) self.replace_vocal_frame_inst = None logging.info("SongCoverInference init sp={}".format(time.time() - st)) def separate(self, cid, src_mp3, vocal_path, acc_path): """ 人声伴奏分离 :param cid: :param src_mp3: :param vocal_path: :param acc_path: :return: """ st = time.time() if self.separate_inst is None: self.separate_inst = SeparateInterface() if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path): return gs_err_code_separate if not os.path.exists(vocal_path) or not os.path.exists(acc_path): return gs_err_code_separate # 转码出一个32k单声道的数据 cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {} -loglevel fatal".format(vocal_path, self.vocal_32_path) os.system(cmd) if not os.path.exists(self.vocal_32_path): return gs_err_code_trans_32 print("separate:cid={}|sp={}".format(cid, time.time() - st)) return gs_err_code_success def get_start_ms(self, vocal_path): """ 给定原始音频,找一段连续10s的音频 :param vocal_path: :return: """ audio, sr = librosa.load(vocal_path, sr=16000) audio = librosa.util.normalize(audio) # 帧长100ms,帧移10ms,计算能量 power_arr = [] for i in range(0, len(audio) - 1600, 160): power_arr.append(np.sum(np.abs(audio[i:i + 160])) / 160) # 将能量小于等于10的部分做成段 power_arr = construct_power_fragment(power_arr) fragments = [] last_pos = 0 for idx, line in enumerate(power_arr): start = round(float(line[0]) * 0.01, 3) duration = round(float(line[1]) * 0.01, 3) fragments.append([last_pos, start - last_pos]) last_pos = start + duration if last_pos < len(audio) / sr: fragments.append([last_pos, len(audio) / sr - last_pos]) # 合并数据,两者间隔在50ms以内的合并起来 idx = 0 while idx < len(fragments) - 1: if fragments[idx + 1][0] - (fragments[idx][0] + fragments[idx][1]) < 0.05: fragments[idx][1] = fragments[idx + 1][0] + fragments[idx + 1][1] - fragments[idx][0] del fragments[idx + 1] idx -= 1 idx += 1 # out_file = vocal_path + "_power.csv" # with open(out_file, "w") as f: # f.write("Name\tStart\tDuration\tTime Format\tType\n") # for fragment in fragments: # start = round(float(fragment[0]), 3) # duration = round(float(fragment[1]), 3) # strr = "{}\t{}\t{}\t{}\n".format("11", start, duration, "decimal\tCue\t") # f.write(strr) # 筛选出开始的位置 # 1. 连续时长大于10s,当前段长度大于3s # 2. 
不可用 # 从0到fragments[idx], 包含idx其中人声段的总和 tot_vocal_duration = [fragments[0][1]] for i in range(1, len(fragments)): tot_vocal_duration.append(tot_vocal_duration[i - 1] + fragments[i][1]) # 计算出任意两段之间非人声占比 for i in range(0, len(fragments)): if fragments[i][1] >= 3: now_tot = 0 if i > 0: now_tot = tot_vocal_duration[i - 1] for j in range(i + 1, len(fragments)): cur_rate = tot_vocal_duration[j] - now_tot cur_rate = cur_rate / (fragments[j][1] + fragments[j][0] - fragments[i][0]) if cur_rate > 0.1: return fragments[i][0] return -1 def inference_speaker(self): """ 推理生成合成后的音频 随机取5个干声,选择占比最小的,并且要求占比小于0.3 :return: """ st = time.time() out_speakers = random.sample(self.speakers, 15) out_songs_dict = {} for speaker in out_speakers: model_path = self.speakers_model_path.format(speaker) config_path = self.speakers_model_config.format(speaker) song_path = os.path.join(self.cache_dir, "{}_{}.wav".format(self.cid, speaker)) try: inf(model_path, config_path, self.vocal_32_path, song_path, "prod") except Exception as ex: logging.info("cid={}, inference_speaker err={}".format(self.cid, ex)) continue if os.path.exists(song_path): if self.replace_vocal_frame_inst is None: self.replace_vocal_frame_inst = ReplaceVocalFrame( "data/models/split_dirty_frame_v5_3_epoch3_852.pth") rate = self.replace_vocal_frame_inst.get_rate(song_path) if rate < 0.3: out_songs_dict[song_path] = rate # 从内部选择占比最低的 out_songs = [] if len(out_songs_dict.keys()) > 0: st_sec = self.get_start_ms(self.vocal_path) song_msg = sorted(out_songs_dict.items(), key=lambda kv: kv[1])[0] out_songs = [song_msg[0]] logging.info("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2), round(st_sec, 3))) print("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2), round(st_sec, 3))) # logging.info("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st)) print("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st)) return out_songs def get_new_vocal_rate(self, songs): """ 获取人声的比率 :param songs: :return: """ st = time.time() need_to_process_song = [] for song in songs: if self.replace_vocal_frame_inst is None: self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth") rate = self.replace_vocal_frame_inst.get_rate(song) logging.info("{} {} replace_rate={}".format(self.cid, song, rate)) if rate < 1.0: need_to_process_song.append(song) logging.info( "get_new_vocal_rate belen = {} len = {} finish sp = {}".format(len(songs), len(need_to_process_song), time.time() - st)) return need_to_process_song def preprocess_vocal(self, songs, vocal_path): """ 1. 降噪 2. 
拉伸 :param songs: :param vocal_path: 参考的音频信号 :return: """ st = time.time() dv_out_list = [] for song in songs: denoise_path = str(song).replace(".wav", "_d.wav") cmd = "{} {} {}".format(gs_denoise_exe, song, denoise_path) os.system(cmd) if not os.path.exists(denoise_path): print("{} {} ERROR denoise".format(self.cid, song)) continue # 拉伸 volume_path = str(song).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, denoise_path, vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR denoise".format(self.cid, volume_path)) continue dv_out_list.append(volume_path) print( "preprocess_vocal belen = {} len = {} finish sp = {}".format(len(songs), len(dv_out_list), time.time() - st)) return dv_out_list def output(self, dv_out_list): """ 对外输出数据 :param dv_out_list: :return: """ st = time.time() out_dir = os.path.join(self.work_dir, self.cid) if os.path.exists(out_dir): shutil.rmtree(out_dir) os.makedirs(out_dir) # 拷贝数据 dst_mp3_path = os.path.join(out_dir, "src_mp3") dst_acc_path = os.path.join(out_dir, "acc.mp3") dst_vocal_path = os.path.join(out_dir, "vocal.mp3") shutil.copyfile(self.src_mp3, dst_mp3_path) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.acc_path, dst_acc_path) os.system(cmd) if not os.path.exists(dst_acc_path): return gs_err_code_encode_err cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.vocal_path, dst_vocal_path) os.system(cmd) if not os.path.exists(dst_vocal_path): return gs_err_code_encode_err # 将所有数据放到out_dir中,用于给人工标注 for dv_wav in dv_out_list: dv_wav_name = str(dv_wav).split("/")[-1].replace(".wav", "_441.mp3") dst_dv_path = os.path.join(out_dir, dv_wav_name) cmd = "ffmpeg -i {} -ar 44100 -ac 1 -ab 320k -y {} -loglevel fatal".format(dv_wav, dst_dv_path) os.system(cmd) if not os.path.exists(dst_dv_path): print("{} encode err!".format(cmd)) continue logging.info( "preprocess_vocal output sp = {}".format(time.time() - st)) def process_one(self, cid, work_dir, enable_output=False): logging.info("\nstart:cid={},work_dir={}----------------------->>>>>>>>".format(cid, work_dir)) self.cid = cid self.work_dir = work_dir # 所有不对外交付的,全部放到这里 self.cache_dir = os.path.join(work_dir, "cache") if os.path.exists(self.cache_dir): shutil.rmtree(self.cache_dir) os.makedirs(self.cache_dir) self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.src_mp3): return gs_err_code_no_src_mp3 self.vocal_path = os.path.join(self.cache_dir, "vocal.wav") self.vocal_32_path = os.path.join(self.cache_dir, "vocal_32.wav") self.acc_path = os.path.join(self.cache_dir, "acc.wav") if not os.path.exists(self.vocal_32_path): logging.info("start separate ... 
{} {} {}".format(self.src_mp3, self.vocal_path, self.acc_path)) err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path) if err != gs_err_code_success: return err, None, None logging.info("start inference_speaker ...") out_songs = self.inference_speaker() dv_out_list = self.preprocess_vocal(out_songs, self.vocal_path) if len(dv_out_list) == 0: return gs_err_code_no_good_choice, None, None mix_mp3_path = None gender = -1 if enable_output: self.output(dv_out_list) else: # 默认全部处理一遍 for dv_out_path in dv_out_list: src_path = dv_out_path.replace("_dv.wav", ".wav") err, mix_mp3_path = self.after_process(self.cid, self.work_dir, src_path, dv_out_path, self.vocal_path, self.acc_path, True, False) if err != gs_err_code_success: logging.info("after_process err {}".format(err)) # 取出性别属性 if err == gs_err_code_success and mix_mp3_path is not None: gender = self.speakers2gender[int(str(os.path.basename(mix_mp3_path)).split("_")[1])] logging.info("finish:cid={},work_dir={}----------------------->>>>>>>>".format(cid, work_dir)) return gs_err_code_success, mix_mp3_path, gender def reverb_by_vocal(self, file): st = time.time() file_442 = file.replace(".wav", "_442.wav") if not os.path.exists(file_442): cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(file, file_442) os.system(cmd) if not os.path.exists(file_442): return gs_err_code_trans2_442, None file_dst = file.replace(".wav", "_442_dr.wav") cmd = "{} {} {} {}".format(gs_rever_path, self.vocal_path, file_442, file_dst) os.system(cmd) if not os.path.exists(file_dst): return gs_err_code_reverb, None print("cid = {}, reverb_by_vocal sp={}".format(self.cid, time.time() - st)) return gs_err_code_success, file_dst def after_process(self, cid, work_dir, in_file, effect_file, vocal_file, acc_file, need_draw=True, need_reverb=True): """ 后处理逻辑 将处理好的音频进行替换,然后和伴奏进行混合,最后进行编码 :return: """ if need_reverb: # 抓取混响 err, effect_file = self.reverb_by_vocal(in_file) if err != gs_err_code_success: return err, None if need_draw: # 增加一个拉伸的步骤 volume_path = str(effect_file).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, vocal_file, volume_path) print(cmd) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None effect_file = volume_path st = time.time() self.cid = cid self.work_dir = work_dir self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.work_dir): return gs_err_code_no_src_dir self.replace_vocal_frame_inst.process(in_file, effect_file, vocal_file) dst_path = effect_file + "_replace.wav" if not os.path.exists(dst_path): return gs_err_code_replace_err, None print("replace_vocal_frame_inst sp = {}".format(time.time() - st)) # 转码 dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav") cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442) os.system(cmd) if not os.path.exists(dst_path_442): return gs_err_code_replace_trans_err, None # 合并转码后再做一次拉伸,保证响度 volume_path = str(dst_path_442).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, vocal_file, volume_path) print(cmd) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None dst_path_442 = volume_path # 混合 mix_path = dst_path_442.replace("_replace442.wav", "_replace442_mix.wav") cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, acc_file, mix_path) 
print("{}".format(cmd)) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix_err, None # 编码为mp3 output_dir = os.path.join(self.work_dir, self.cid + "_out") if not os.path.exists(output_dir): os.makedirs(output_dir) name = str(mix_path).replace("_replace442_mix.wav", "_replace442_mix.mp3").split("/")[-1] mix_path_mp3 = os.path.join(output_dir, name) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_mix_transcode_err, None # 拷贝src到output_dir # shutil.copyfile(self.src_mp3, os.path.join(output_dir, "src.mp3")) # logging.info("after_process sp = {}".format(time.time() - st)) return gs_err_code_success, mix_path_mp3 ####################################新对外接口############################################################ def prepare_env(self, cid, work_dir, create_dir=False): self.cid = cid self.work_dir = work_dir # 所有不对外交付的,全部放到这里 self.cache_dir = os.path.join(work_dir, "cache") if create_dir: if os.path.exists(self.cache_dir): shutil.rmtree(self.cache_dir) os.makedirs(self.cache_dir) self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.src_mp3): return gs_err_code_no_src_mp3 self.vocal_path = os.path.join(self.cache_dir, "vocal.wav") self.vocal_32_path = os.path.join(self.cache_dir, "vocal_32.wav") self.acc_path = os.path.join(self.cache_dir, "acc.wav") return gs_err_code_success def generate_svc_file(self, cid, work_dir): """ :param cid: :param work_dir: :return:err_code, 生成出的svc的文件名称 """ err = self.prepare_env(cid, work_dir, create_dir=True) if err != gs_err_code_success: return err, None # 音源分离 if not os.path.exists(self.vocal_32_path): st = time.time() err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path) logging.info("cid={},separate,sp={}".format(self.cid, time.time() - st)) if err != gs_err_code_success: return err, None # 生成svc,只保留一个最佳的 st = time.time() out_songs = self.inference_speaker() if len(out_songs) == 0: return gs_err_code_no_good_choice, None logging.info("cid={},inference_speaker,{},sp={}".format(self.cid, out_songs[0], time.time() - st)) return gs_err_code_success, out_songs[0] def effect(self, cid, work_dir, svc_file): st = time.time() err = self.prepare_env(cid, work_dir) if err != gs_err_code_success: return err, None logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 预处理人声 dv_out_list = self.preprocess_vocal([svc_file], self.vocal_path) if len(dv_out_list) == 0: return gs_err_code_preprocess_vocal, None svc_file = dv_out_list[0] # 做音效 st = time.time() err, effect_file = self.reverb_by_vocal(svc_file) if err != gs_err_code_success: return err, None logging.info("cid={},reverb_by_vocal,{},sp={}".format(self.cid, svc_file, time.time() - st)) return err, effect_file def mix(self, cid, work_dir, svc_file, effect_file): """ 做音效以及合并 :param cid: :param work_dir: :param svc_file: :param effect_file: :return: err_code, 完成的mp3文件 """ st = time.time() err = self.prepare_env(cid, work_dir) if err != gs_err_code_success: return err, None logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 拉伸 st = time.time() volume_path = str(effect_file).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, self.vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None effect_file = volume_path 
logging.info("cid={},draw_volume,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 替换 st = time.time() try: if self.replace_vocal_frame_inst is None: self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth") self.replace_vocal_frame_inst.process(svc_file, effect_file, self.vocal_path) except Exception as ex: logging.info("{},replace_vocal_frame_inst, {}", self.cid, ex) return gs_err_code_replace_except_err, None dst_path = effect_file + "_replace.wav" if not os.path.exists(dst_path): return gs_err_code_replace_err, None logging.info("cid={},replace_vocal_frame_inst,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 转码 st = time.time() dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav") cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442) os.system(cmd) if not os.path.exists(dst_path_442): return gs_err_code_replace_trans_err, None logging.info("cid={},transcode,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 合并转码后再做一次拉伸,保证响度 st = time.time() volume_path = str(dst_path_442).replace("_replace442.wav", "_replace442_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, self.vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None dst_path_442 = volume_path logging.info("cid={},draw_volume2,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 混合 st = time.time() mix_path = dst_path_442.replace("_replace442_dv.wav", "_replace442_dv_mix.wav") cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, self.acc_path, mix_path) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix_err, None logging.info("cid={},mixer,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 编码为mp3 st = time.time() output_dir = os.path.join(self.work_dir, self.cid + "_out") if not os.path.exists(output_dir): os.makedirs(output_dir) name = str(mix_path).replace("_replace442_dv_mix.wav", "_replace442_dv_mix.mp3").split("/")[-1] mix_path_mp3 = os.path.join(output_dir, name) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) print(cmd) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_mix_transcode_err, None logging.info("cid={},encode,{},sp={}".format(self.cid, svc_file, time.time() - st)) return gs_err_code_success, mix_path_mp3 def get_gender(self, svc_file): return self.speakers2gender[int(os.path.basename(svc_file.replace(".wav", "")).split("_")[1])] def process_one_logic(self, cid, work_dir): """ 搞成两部分: 1. 分离数据+5次推理,获取最佳结果,并保存 2. 
利用最佳结果做音效以及合并 :return: """ err, svc_file = self.generate_svc_file(cid, work_dir) gender = -1 if err != gs_err_code_success: return err, svc_file, gender, gender = self.get_gender(svc_file) err, effect_file = self.effect(cid, work_dir, svc_file) if err != gs_err_code_success: return err, svc_file, gender err, mix_mp3_path = self.mix(cid, work_dir, svc_file, effect_file) return err, mix_mp3_path, gender def test(): arr = [ # "611752105020343687", # "611752105023532439", # "611752105030419688", # "611752105030485748", # "611752105030485685", "dzq", ] base_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test" s_inst = SongCoverInference() for cid in arr: st = time.time() # err, mix_mp3, gender = s_inst.process_one(cid, os.path.join(base_dir, cid), False) err, mix_mp3, gender = s_inst.process_one_logic(cid, os.path.join(base_dir, cid)) print(mix_mp3, gender) print("cid={} RealFinish err={} sp={}".format(cid, err, time.time() - st)) def test_gene_svc(): base_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test" # cid = "clean_yibo" cid = "dzq" work_dir = os.path.join(base_dir, cid) st = time.time() - speaker = "1125899914308640_v1" + speaker = "jianli" speakers_model_path = "data/train_users/{}/logs/32k/G_2000.pth" speakers_model_config = "data/train_users/{}/config/config.json" model_path = speakers_model_path.format(speaker) config_path = speakers_model_config.format(speaker) # 缓存目录: cache_dir = os.path.join(work_dir, "cache") if os.path.exists(cache_dir): shutil.rmtree(cache_dir) os.makedirs(cache_dir) song_path = os.path.join(cache_dir, "{}_{}.wav".format(cid, speaker)) - vocal_path = os.path.join(work_dir, "vocal_32.wav") + # vocal_path = os.path.join(work_dir, "vocal_32.wav") + vocal_path = os.path.join(work_dir, "test_silce.wav") inf(model_path, config_path, vocal_path, song_path, "prod") print("finish....") if __name__ == '__main__': test_gene_svc() diff --git a/AutoCoverTool/online/tone_shift_one.py b/AutoCoverTool/online/tone_shift_one.py index 585ed35..d734c21 100644 --- a/AutoCoverTool/online/tone_shift_one.py +++ b/AutoCoverTool/online/tone_shift_one.py @@ -1,232 +1,267 @@ """ 变调的方式做处理 1. 下载 2. 分离 3. 针对于人声变调+2,伴奏+1 4. 
合成 """ import os import json import shutil import librosa import logging import numpy as np from ref.music_remover.separate_interface import SeparateInterface from online.inference_worker import upload_file2cos, gs_state_use, gs_state_finish, gs_state_default from online.common import * logging.basicConfig(filename='/tmp/tone_shift_one.log', level=logging.INFO) gs_tone_shift_exe = "/opt/soft/bin/tone_shift_exe" gs_simple_mixer_path = "/opt/soft/bin/simple_mixer" gs_err_code_success = 0 gs_err_code_tone_shift = 1 gs_err_code_mix = 2 gs_err_code_transcode = 3 gs_err_code_upload = 4 gs_err_code_download = 5 gs_err_code_trans_to_mp3 = 6 gs_err_code_separate = 7 gs_err_code_duration_too_long = 8 gs_err_code_duration_no_vocal = 9 gs_err_code_duration_err = 10 +gs_err_code_transcode_acc = 11 +gs_err_code_upload_acc = 12 def exec_cmd(cmd): r = os.popen(cmd) text = r.read() r.close() return text def get_d(audio_path): cmd = "ffprobe -v quiet -print_format json -show_format -show_streams {}".format(audio_path) data = exec_cmd(cmd) data = json.loads(data) # 返回秒 if 'format' in data.keys() and 'duration' in data['format']: return float(data["format"]["duration"]) return -1 def get_mean_power(audio_path): sr = 44100 audio, sr = librosa.load(audio_path, sr=sr, mono=True) mm = np.mean(np.abs(audio)) return mm class ToneShift: def __init__(self): self.separate_inst = SeparateInterface() def update_state(self, song_id, state): sql = "update svc_queue_table set state={},update_time={} where song_id = {}". \ format(state, int(time.time()), song_id) banned_user_map['db'] = "av_db" update_db(sql, banned_user_map) def get_url_by_id(self, song_id): sql = "select song_id, url from svc_queue_table where song_id={}".format(song_id) banned_user_map["db"] = "av_db" data = get_data_by_mysql(sql) if len(data) == 0: return None, None return str(data[0][0]), data[0][1] - def get_one_data(self): - sql = "select song_id, url from svc_queue_table where state = 0 and song_src=3 order by create_time desc limit 1" + def get_one_data_logic(self): + """ + 按照5,4,3的优先级进行获取 + :return: + """ + song_src_arr = [5, 4, 3] + for song_src in song_src_arr: + song_id, song_url = self.get_one_data(song_src=song_src) + if song_id is not None: + return song_id, song_url + return None, None + + def get_one_data(self, song_src=3): + sql = "select song_id, url from svc_queue_table where state = 0 and song_src={} order by create_time desc limit 1".format( + song_src) banned_user_map["db"] = "av_db" data = get_data_by_mysql(sql, banned_user_map) if len(data) == 0: return None, None song_id, song_url = data[0] if song_id != "": self.update_state(song_id, gs_state_use) return str(song_id), song_url def pre_process(self, work_dir, song_url): """ 创建文件夹,下载数据 :return: """ if "?sign=" in song_url: return gs_err_code_download ext = str(song_url).split(".")[-1] dst_file = "{}/src_origin.{}".format(work_dir, ext) cmd = "wget {} -O {}".format(song_url, dst_file) os.system(cmd) if not os.path.exists(dst_file): return gs_err_code_download duration = get_d(dst_file) if duration < 0: return gs_err_code_duration_err print("Duration:", dst_file, duration) if duration > 20 * 60: return gs_err_code_duration_too_long - dst_mp3_file = "{}/src.mp3".format(work_dir) + dst_mp3_file = "{}/src.wav".format(work_dir) cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} ".format(dst_file, dst_mp3_file) os.system(cmd) if not os.path.exists(dst_mp3_file): return gs_err_code_trans_to_mp3 return gs_err_code_success def tone_shift_one(self, in_file, dst_file, pitch): cmd = "{} {} {} 
{}".format(gs_tone_shift_exe, in_file, dst_file, pitch) os.system(cmd) return os.path.exists(dst_file) def mix(self, cid, vocal_path, acc_path, tp): if tp == 1: vocal_pitch = 2 acc_pitch = 0 else: vocal_pitch = -2 acc_pitch = 0 vocal_path_2 = vocal_path.replace(".wav", "_{}.wav".format(vocal_pitch)) acc_path_2 = acc_path.replace(".wav", "_{}.wav".format(acc_pitch)) err = self.tone_shift_one(vocal_path, vocal_path_2, vocal_pitch) if not err: return gs_err_code_tone_shift, None err = self.tone_shift_one(acc_path, acc_path_2, acc_pitch) if not err: return gs_err_code_tone_shift, None base_dir = os.path.dirname(vocal_path) mix_path = "{}/mix_{}_{}.wav".format(base_dir, vocal_pitch, acc_pitch) cmd = "{} {} {} {}".format(gs_simple_mixer_path, vocal_path_2, acc_path_2, mix_path) print("exec_cmd={}".format(cmd)) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix, None # 转码 - mix_path_mp3 = mix_path.replace(".wav", ".mp3") - cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) + mix_path_mp3 = mix_path.replace(".wav", ".mp4") + cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(mix_path, mix_path_mp3) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_transcode, None # 上传到cos mix_name = os.path.basename(mix_path_mp3) key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name) if not upload_file2cos(key, mix_path_mp3): return gs_err_code_upload, None return gs_err_code_success, key + def upload_acc(self, cid, acc_path): + # 转码 + mix_path_aac = acc_path.replace(".wav", ".m4a") + cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(acc_path, mix_path_aac) + os.system(cmd) + if not os.path.exists(mix_path_aac): + return gs_err_code_transcode_acc, None + + # 上传 + mix_name = os.path.basename(mix_path_aac) + key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name) + if not upload_file2cos(key, mix_path_aac): + return gs_err_code_upload_acc, None + return gs_err_code_success, key + def process_one(self, cid, work_dir): """ :param cid: :param work_dir: :return: """ - src_mp3 = os.path.join(work_dir, "src.mp3") + src_mp3 = os.path.join(work_dir, "src.wav") vocal_path = os.path.join(work_dir, "vocal.wav") acc_path = os.path.join(work_dir, "acc.wav") if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path): return gs_err_code_separate, [] if not os.path.exists(vocal_path) or not os.path.exists(acc_path): return gs_err_code_separate, [] # 当人声的平均能量小于一定值时,则认为无人声(0.01是经验值判定,样本分析来看) # 无人声的样本[0.0056, 0.0003], 有人声的样本(目前最小)[0.046, 0.049] print("power:{},{}".format(cid, get_mean_power(vocal_path))) if get_mean_power(vocal_path) < 0.02: return gs_err_code_duration_no_vocal, [] err, type1_mix_mp3 = self.mix(cid, vocal_path, acc_path, 1) if err != gs_err_code_success: return err, [] err, type2_mix_mp3 = self.mix(cid, vocal_path, acc_path, 2) if err != gs_err_code_success: return err, [] - return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3] + + # 上传伴奏文件 + err, acc_path_m4a = self.upload_acc(cid, acc_path) + if err != gs_err_code_success: + return err, [] + return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3, acc_path_m4a] def process_worker(self): logging.info("start process_worker .....") base_dir = "/tmp/tone_shift_one" if not os.path.exists(base_dir): os.makedirs(base_dir) while True: worker_st = time.time() - cid, song_url = self.get_one_data() - # cid, song_url = self.get_url_by_id('611752105022612883') + cid, song_url = 
self.get_one_data_logic() + # cid, song_url = self.get_url_by_id('611752105029706360') if cid is None: time.sleep(5) logging.info("get one data is None ...") continue work_dir = os.path.join(base_dir, str(cid)) if os.path.exists(work_dir): shutil.rmtree(work_dir) os.makedirs(work_dir) err = self.pre_process(work_dir, song_url) if err != gs_err_code_success: self.update_state(str(cid), -err) continue st = time.time() err, data = self.process_one(str(cid), work_dir) logging.info("process_finish,{},{}".format(cid, time.time() - st)) - if err == gs_err_code_success and len(data) == 2: + if err == gs_err_code_success and len(data) != 0: sql = "update svc_queue_table set state={},update_time={},svc_url=\"{}\" where song_id = {}". \ format(gs_state_finish, int(time.time()), ",".join(data), str(cid)) banned_user_map['db'] = "av_db" update_db(sql, banned_user_map) else: self.update_state(str(cid), -err) shutil.rmtree(work_dir) logging.info("process_finish,{},{}".format(cid, time.time() - worker_st)) if __name__ == '__main__': ts = ToneShift() ts.process_worker() diff --git a/AutoCoverTool/ref/so_vits_svc/inference_main.py b/AutoCoverTool/ref/so_vits_svc/inference_main.py index 20a9439..e1579ec 100644 --- a/AutoCoverTool/ref/so_vits_svc/inference_main.py +++ b/AutoCoverTool/ref/so_vits_svc/inference_main.py @@ -1,83 +1,83 @@ import io import os import sys import logging import time from pathlib import Path import librosa import numpy as np import soundfile from inference import infer_tool from inference import slicer from inference.infer_tool import Svc logging.getLogger('numba').setLevel(logging.WARNING) -chunks_dict = infer_tool.read_temp("ref/so-vits-svc/inference/chunks_temp.json") +chunks_dict = infer_tool.read_temp("ref/so_vits_svc/inference/chunks_temp.json") def inf(model_path, config_path, raw_audio_path, dst_path, dev): # model_path = "logs/32k/G_174000-Copy1.pth" # config_path = "configs/config.json" svc_model = Svc(model_path, config_path) out_dir = os.path.dirname(dst_path) print(dst_path) os.makedirs(out_dir, exist_ok=True) # 支持多个wav文件,放在raw文件夹下 tran = 0 spk_list = ['speaker0'] # 每次同时合成多语者音色 slice_db = -40 # 默认-40,嘈杂的音频可以-30,干声保留呼吸可以-50 wav_format = 'wav' # 音频输出格式 # infer_tool.fill_a_to_b(trans, clean_names) # for clean_name, tran in zip(clean_names, trans): # raw_audio_path = f"raw/{clean_name}" # if "." 
not in raw_audio_path: # raw_audio_path += ".wav" infer_tool.format_wav(raw_audio_path) wav_path = Path(raw_audio_path).with_suffix('.wav') chunks = slicer.cut(wav_path, db_thresh=slice_db) audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks) for spk in spk_list: audio = [] for (slice_tag, data) in audio_data: print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======') length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample)) raw_path = io.BytesIO() soundfile.write(raw_path, data, audio_sr, format="wav") raw_path.seek(0) if slice_tag: print('jump empty segment') _audio = np.zeros(length) else: out_audio, out_sr = svc_model.infer(spk, tran, raw_path, dev == "test") _audio = out_audio.cpu().numpy() audio.extend(list(_audio)) soundfile.write(dst_path, audio, svc_model.target_sample, format=wav_format) if __name__ == '__main__': g_model = sys.argv[1] # 模型地址 g_config = sys.argv[2] # 配置文件地址 g_audio_path = sys.argv[3] # 输入的音频文件地址,wav g_dst_path = sys.argv[4] # 输出的音频文件地址 if os.path.exists(g_dst_path): print("{} success ...".format(g_dst_path)) exit(0) g_dev = "prod" if len(sys.argv) > 5: g_dev = sys.argv[5] g_aa, g_sr = librosa.load(g_audio_path) d = librosa.get_duration(g_aa, g_sr) # if g_dev != "test": # if d > 250: # print("{} too long".format(g_audio_path)) # exit(0) st = time.time() inf(g_model, g_config, g_audio_path, g_dst_path, g_dev) print("{}, inference sp={}".format(g_audio_path, time.time() - st)) diff --git a/AutoCoverTool/ref/so_vits_svc/models.py b/AutoCoverTool/ref/so_vits_svc/models.py index 09c4045..3e3498b 100644 --- a/AutoCoverTool/ref/so_vits_svc/models.py +++ b/AutoCoverTool/ref/so_vits_svc/models.py @@ -1,352 +1,357 @@ import copy import math import torch from torch import nn from torch.nn import functional as F import attentions import commons import modules from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from commons import init_weights, get_padding from vdecoder.hifigan.models import Generator from utils import f0_to_coarse class ResidualCouplingBlock(nn.Module): def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0): super().__init__() self.channels = channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size self.dilation_rate = dilation_rate self.n_layers = n_layers self.n_flows = n_flows self.gin_channels = gin_channels self.flows = nn.ModuleList() for i in range(n_flows): self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) self.flows.append(modules.Flip()) def forward(self, x, x_mask, g=None, reverse=False): if not reverse: for flow in self.flows: x, _ = flow(x, x_mask, g=g, reverse=reverse) else: for flow in reversed(self.flows): x = flow(x, x_mask, g=g, reverse=reverse) return x class Encoder(nn.Module): def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size self.dilation_rate = dilation_rate self.n_layers = n_layers self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) self.proj = nn.Conv1d(hidden_channels, out_channels * 
2, 1) def forward(self, x, x_lengths, g=None): # print(x.shape,x_lengths.shape) x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask m, logs = torch.split(stats, self.out_channels, dim=1) z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask return z, m, logs, x_mask class TextEncoder(nn.Module): def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, filter_channels=None, n_heads=None, p_dropout=None): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size self.dilation_rate = dilation_rate self.n_layers = n_layers self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) self.f0_emb = nn.Embedding(256, hidden_channels) self.enc_ = attentions.Encoder( hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) def forward(self, x, x_lengths, f0=None): + # x->(b,256,frame_num), x_lengths -> (b) + # commons.sequence_mask 对于batch层级有价值,x_lengths是每个batch中每一个元素的帧数 + # 比如输入([3,5,2], 5)那么得到 3 * 5的True/False矩阵,其中第一层矩阵为3个true,2个false,第二层全true,第三层前两个true,其余false + # 作用一个批次中允许不同长度的数据一起训练,此时较短的乘以false,剔除影响 x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = x + self.f0_emb(f0).transpose(1,2) x = self.enc_(x * x_mask, x_mask) stats = self.proj(x) * x_mask m, logs = torch.split(stats, self.out_channels, dim=1) z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask return z, m, logs, x_mask class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() self.period = period self.use_spectral_norm = use_spectral_norm norm_f = weight_norm if use_spectral_norm == False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), ]) self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) def forward(self, x): fmap = [] # 1d to 2d b, c, t = x.shape if t % self.period != 0: # pad first n_pad = self.period - (t % self.period) x = F.pad(x, (0, n_pad), "reflect") t = t + n_pad x = x.view(b, c, t // self.period, self.period) for l in self.convs: x = l(x) x = F.leaky_relu(x, modules.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) x = torch.flatten(x, 1, -1) return x, fmap class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() norm_f = weight_norm if use_spectral_norm == False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv1d(1, 16, 15, 1, padding=7)), norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 
]) self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) def forward(self, x): fmap = [] for l in self.convs: x = l(x) x = F.leaky_relu(x, modules.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) x = torch.flatten(x, 1, -1) return x, fmap class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(MultiPeriodDiscriminator, self).__init__() periods = [2,3,5,7,11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] self.discriminators = nn.ModuleList(discs) def forward(self, y, y_hat): y_d_rs = [] y_d_gs = [] fmap_rs = [] fmap_gs = [] for i, d in enumerate(self.discriminators): y_d_r, fmap_r = d(y) y_d_g, fmap_g = d(y_hat) y_d_rs.append(y_d_r) y_d_gs.append(y_d_g) fmap_rs.append(fmap_r) fmap_gs.append(fmap_g) return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - + + class SpeakerEncoder(torch.nn.Module): def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): super(SpeakerEncoder, self).__init__() self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() def forward(self, mels): self.lstm.flatten_parameters() _, (hidden, _) = self.lstm(mels) embeds_raw = self.relu(self.linear(hidden[-1])) return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) - + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): mel_slices = [] for i in range(0, total_frames-partial_frames, partial_hop): mel_range = torch.arange(i, i+partial_frames) mel_slices.append(mel_range) - + return mel_slices - + def embed_utterance(self, mel, partial_frames=128, partial_hop=64): mel_len = mel.size(1) last_mel = mel[:,-partial_frames:] - + if mel_len > partial_frames: mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) mels = list(mel[:,s] for s in mel_slices) mels.append(last_mel) mels = torch.stack(tuple(mels), 0).squeeze(1) - + with torch.no_grad(): partial_embeds = self(mels) embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) #embed = embed / torch.linalg.norm(embed, 2) else: with torch.no_grad(): embed = self(last_mel) - + return embed class SynthesizerTrn(nn.Module): """ Synthesizer for Training """ - def __init__(self, + def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, upsample_kernel_sizes, gin_channels, ssl_dim, n_speakers, **kwargs): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels self.filter_channels = filter_channels self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size self.p_dropout = p_dropout self.resblock = resblock self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_dilation_sizes = resblock_dilation_sizes self.upsample_rates = upsample_rates self.upsample_initial_channel = upsample_initial_channel self.upsample_kernel_sizes = upsample_kernel_sizes self.segment_size = segment_size self.gin_channels = gin_channels self.ssl_dim = ssl_dim self.emb_g = nn.Embedding(n_speakers, gin_channels) self.enc_p_ = 
TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16,0, filter_channels, n_heads, p_dropout) hps = { "sampling_rate": 32000, "inter_channels": 192, "resblock": "1", "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], "upsample_rates": [10, 8, 2, 2], "upsample_initial_channel": 512, "upsample_kernel_sizes": [16, 16, 4, 4], "gin_channels": 256, } self.dec = Generator(h=hps) self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) def forward(self, c, f0, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): - # hubert特征, f0, 幅度谱特征, 说话人id,mel谱特征 + # hubert特征(b,256,frame_num), f0 (frame_num), 幅度谱特征, 说话人id,mel谱特征 if c_lengths == None: - c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) # (b, frame_num) if spec_lengths == None: spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) + # 说话人信息embding g = self.emb_g(g).transpose(1,2) z_ptemp, m_p, logs_p, _ = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0)) - z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) z_p = self.flow(z, spec_mask, g=g) z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size) # o = self.dec(z_slice, g=g) o = self.dec(z_slice, g=g, f0=pitch_slice) return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) def infer(self, c, f0, g=None, mel=None, c_lengths=None): if c_lengths == None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) g = self.emb_g(g).transpose(1,2) z_p, m_p, logs_p, c_mask = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0)) z = self.flow(z_p, c_mask, g=g, reverse=True) o = self.dec(z * c_mask, g=g, f0=f0) return o diff --git a/AutoCoverTool/script/get_song_url.py b/AutoCoverTool/script/get_song_url.py index ab483db..6514536 100644 --- a/AutoCoverTool/script/get_song_url.py +++ b/AutoCoverTool/script/get_song_url.py @@ -1,827 +1,827 @@ """ 获取歌曲的地址 # song_src=2 是来源108和109的歌曲,未被洗过的 # song_src=1 是曲库给的 # song_src=3 # 用于轻变调的 """ from script.common import * from copy import deepcopy from online.common import update_db def get_url_by_song_id(song_id): sql = "select task_url,starmaker_songid from silence where starmaker_songid = {} order by task_id desc limit 1".format( song_id) ban = deepcopy(banned_user_map) ban["db"] = "starmaker_musicbook" data = get_data_by_mysql(sql, ban) if len(data) > 0: return data[0][0] return None def process(): arr = [ "611752105020327762", "611752105020332343", "611752105020332580", "611752105020382477", "611752105020384960", "611752105020390942", "611752105020402448", "611752105022612883", "611752105022614531", "611752105022647060", "611752105022647065", "611752105022647066", "611752105022647079", "611752105022704186", "611752105022728482", "611752105022728546", "611752105022729259", "611752105022729263", "611752105022729268", "611752105022731042", "611752105022731053", "611752105022731127", "611752105022734755", "611752105022736024", "611752105022738473", "611752105022739648", "611752105022739650", "611752105022740713", "611752105022741712", "611752105022743896", "611752105022746068", "611752105022746733", "611752105022747108", "611752105022753906", "611752105022757968", "611752105022763051", "611752105022763880", "611752105022763884", 
"611752105022764224", "611752105022764688", "611752105022764801", "611752105022765022", "611752105022766341", "611752105022767186", "611752105022767294", "611752105022768062", "611752105022768419", "611752105022768837", "611752105022770004", "611752105022770306", "611752105022772154", "611752105022773633", "611752105022773776", "611752105022774040", "611752105022774127", "611752105022774220", "611752105022774502", "611752105022775091", "611752105022775486", "611752105022775600", "611752105022775907", "611752105022775939", "611752105022776719", "611752105022776721", "611752105022776761", "611752105022776857", "611752105022777051", "611752105022777076", "611752105022777120", "611752105022777328", "611752105022777496", "611752105022777573", "611752105022777600", "611752105022777607", "611752105022777608", "611752105022777611", "611752105022777835", "611752105022778042", "611752105022779020", "611752105022779294", "611752105022779784", "611752105022780210", "611752105022780284", "611752105022780287", "611752105022780345", "611752105022780355", "611752105022780818", "611752105022780961", "611752105022780965", "611752105022781011", "611752105022781144", "611752105022781193", "611752105022781267", "611752105022781268", "611752105022781374", "611752105022781387", "611752105022781807", "611752105022782935", "611752105022783480", "611752105022783776", "611752105022783967", "611752105022784390", "611752105022784996", "611752105022785018", "611752105022785313", "611752105022785621", "611752105022785681", "611752105022812895", "611752105022818025", "611752105022823781", "611752105022825467", "611752105022835406", "611752105022835415", "611752105022835751", "611752105022836470", "611752105022837186", "611752105022837452", "611752105022837464", "611752105022838205", "611752105022838206", "611752105022839030", "611752105022839189", "611752105022839975", "611752105022840319", "611752105022840550", "611752105022840637", "611752105022841089", "611752105022841308", "611752105022841355", "611752105022842120", "611752105022842184", "611752105022842241", "611752105022842989", "611752105022843089", "611752105022843139", "611752105022843331", "611752105022843710", "611752105022843728", "611752105022876795", "611752105022973113", "611752105023086347", "611752105023091102", "611752105023104185", "611752105023162802", "611752105023184121", "611752105023219237", "611752105023234496", "611752105023246015", "611752105023246857", "611752105023258864", "611752105023262008", "611752105023301455", "611752105023306231", "611752105023329571", "611752105023411931", "611752105023449798", "611752105023458990", "611752105023610603", "611752105023678577", "611752105023683356", "611752105023683357", "611752105023725727", "611752105023783626", "611752105023841037", "611752105023908922", "611752105023929521", "611752105024170140", "611752105024183682", "611752105024231227", "611752105024415490", "611752105024466658", "611752105024618170", "611752105024683212", "611752105024728134", "611752105024765795", "611752105024766050", "611752105025090763", "611752105025188532", "611752105025242121", "611752105025348359", "611752105025455871", "611752105025458749", "611752105025458753", "611752105025458809", "611752105025467555", "611752105025475926", "611752105025486355", "611752105025492732", "611752105025496983", "611752105025503613", "611752105025504662", "611752105025506533", "611752105025515144", "611752105025520678", "611752105025521388", "611752105025524664", "611752105025524932", "611752105025526555", "611752105025542775", 
"611752105025542802", "611752105025543710", "611752105025555350", "611752105025558173", "611752105025565020", "611752105025565029", "611752105025565034", "611752105025565044", "611752105025578884", "611752105025580424", "611752105025581305", "611752105025584544", "611752105025720331", "611752105025840622", "611752105026003288", "611752105026090255", "611752105026110280", "611752105026110299", "611752105026110309", "611752105026110320", "611752105026110324", "611752105026110363", "611752105026110435", "611752105026150525", "611752105026152312", "611752105026152320", "611752105026180638", "611752105026180797", "611752105026205984", "611752105026227884", "611752105026343282", "611752105026343284", "611752105026388268", "611752105026417620", "611752105026449246", "611752105026462848", "611752105026465098", "611752105026533365", "611752105026533380", "611752105026533386", "611752105026533657", "611752105026536897", "611752105026536911", "611752105026536913", "611752105026577993", "611752105026580839", "611752105026614487", "611752105026666894", "611752105026666899", "611752105026666904", "611752105026666918", "611752105026666950", "611752105026666964", "611752105026666995", "611752105026667014", "611752105026667025", "611752105026716551", "611752105026779831", "611752105027030955", "611752105027147459", "611752105027186707", "611752105027201043", "611752105027216307", "611752105027228689", "611752105027228702", "611752105027326105", "611752105027460089", "611752105027460125", "611752105027484924", "611752105027601574", "611752105027648113", "611752105027802526", "611752105027854263", "611752105028204403", "611752105028408823", "611752105028477541", "611752105028507652", "611752105028558157", "611752105028593043", "611752105028793344", "611752105028815367", "611752105028820643", "611752105028820644", "611752105028837845", "611752105028858612", "611752105028858622", "611752105028878359", "611752105028879009", "611752105028916096", "611752105028916098", "611752105028975585", "611752105028990740", "611752105029006327", "611752105029041707", "611752105029047058", "611752105029047269", "611752105029054046", "611752105029059915", "611752105029090034", "611752105029204262", "611752105029272970", "611752105029290667", "611752105029291290", "611752105029291293", "611752105029291297", "611752105029291304", "611752105029306974", "611752105029372452", "611752105029432803", "611752105029648535", "611752105029648872", "611752105029648891", "611752105029788767", "611752105029953987", "611752105029954740", "611752105029954853", "611752105029955024", "611752105029956615", "611752105029990162", "611752105029990799", "611752105029991249", "611752105030119229", "611752105030146069", "611752105030220820", "611752105030228836", "611752105030249168", "611752105030249302", "611752105030279605", "61175210503042544", "611752105030447267", "611752105030449043", "611752105030454452", "611752105030470921", "611752105030483301", "611752105030483312", "611752105030483313", "611752105030483344", "611752105030483414", "611752105030483415", "611752105030485565", "611752105030485569", "611752105030485602", "611752105030485608", "611752105030485620", "611752105030485663", "611752105030485711", "611752105030485733", "611752105030485745", "611752105030485799", "611752105030486000", "611752105030486306", "611752105030488510", "611752105030488594", "611752105030488665", "611752105030488713", "611752105030488727", "611752105030488744", "611752105030488814", "611752105030488836", "611752105030488852", "611752105030488864", 
"611752105030488880", "611752105030488962", "611752105030488997", "611752105030489153", "611752105030489354", "611752105030489380", "611752105030489394", "611752105030489403", "611752105030489415", "611752105030494882", "611752105030499117", "611752105030499185", "611752105030499265", "611752105030499310", "611752105030500582", "611752105030501929", "611752105030501994", "611752105030502109", "611752105030502463", "611752105030503847", "611752105030506310", "611752105030507034", "611752105030516849", "611752105030517044", "611752105030517093", "611752105030532792", "611752105030532858", "611752105030532869", "611752105030534180", "611752105030534293", "611752105030534503", "611752105030535017", "611752105030538184", "611752105030544325", "611752105030547499", "611752105030547630", "611752105030547632", "611752105030547638", "611752105030547849", "611752105030547881", "611752105030549919", "611752105030554063", "611752105030554076", "611752105030556613", "611752105030557261", "611752105030557355", "611752105030558647", "611752105030558663", "611752105030559471", "611752105030559472", "611752105030562192", "611752105030562194", "611752105030562196", "611752105030562197", "611752105030562199", "611752105030562203", "611752105030562205", "611752105030562209", "611752105030562211", "611752105030562213", "611752105030562214", "611752105030562218", "611752105030562221", "611752105030562227", "611752105030562228", "611752105030562231", "611752105030562234", "611752105030562236", "611752105030562239", "611752105030562243", "611752105030562245", "611752105030562248", "611752105030562251", "611752105030562254", "611752105030562255", "611752105030562259", "611752105030562262", "611752105030562263", "611752105030562266", "611752105030562268", "611752105030562271", "611752105030562274", "611752105030562277", "611752105030562283", "611752105030562286", "611752105030562289", "611752105030562291", "611752105030562296", "611752105030562302", "611752105030562303", "611752105030562306", "611752105030562311", "611752105030562314", "611752105030562316", "611752105030562322", "611752105030562325", "611752105030562327", "611752105030562333", "611752105030562335", "611752105030562337", "611752105030562338", "611752105030562345", "611752105030562351", "611752105030562378", "611752105030562380", "611752105030562383", "611752105030562386", "611752105030562389", "611752105030562391", "611752105030562392", "611752105030562397", "611752105030562398", "611752105030562399", "611752105030562401", "611752105030562404", "611752105030562405", "611752105030562411", "611752105030562413", "611752105030562414", "611752105030562417", "611752105030562419", "611752105030562424", "611752105030562425", "611752105030562426", "611752105030562428", "611752105030562431", "611752105030562448", "611752105030562457", "611752105030562459", "611752105030562460", "611752105030562463", "611752105030562470", "611752105030562472", "611752105030562473", "611752105030562483", "611752105030562489", "611752105030562493", "611752105030562494", "611752105030562499", "611752105030562502", "611752105030562504", "611752105030562507", "611752105030562512", "611752105030562513", "611752105030562517", "611752105030562522", "611752105030562919", "611752105030562921", "611752105030562924", "611752105030562925", "611752105030562929", "611752105030562931", "611752105030562936", "611752105030562938", "611752105030562939", "611752105030562940", "611752105030562943", "611752105030562946", "611752105030562950", "611752105030562953", "611752105030562954", 
"611752105030562959", "611752105030562960", "611752105030562962", "611752105030562968", "611752105030562974", "611752105030562978", "611752105030562979", "611752105030562981", "611752105030562983", "611752105030562986", "611752105030562988", "611752105030562999", "611752105030563001", "611752105030563003", "611752105030563005", "611752105030563006", "611752105030563010", "611752105030563014", "611752105030563022", "611752105030563025", "611752105030563028", "611752105030563031", "611752105030563034", "611752105030563035", "611752105030563043", "611752105030564818", "611752105030568345", "611752105030568348", "611752105030578193", "611752105030582372", "611752105030590839", "611752105030590840", "611752105030590845", "611752105030590847", "611752105030595548", "611752105030595730", "611752105030597299", "611752105030597505", "611752105030605279", "611752105030605833", "611752105030606179", "611752105030615341", "611752105030617237", "611752105030617286", "611752105030617308", "611752105030617321", "611752105030617329", "611752105030617332", "611752105030617344", "611752105030623889", "611752105030623893", "611752105030629479", "611752105030629626", "611752105030644584", "611752105030644604", "611752105030645910", "611752105030645928", "611752105030645939", "611752105030645952", "611752105030645965", "611752105030645979", "611752105030645993", "611752105030646025", "611752105030646052", "611752105030646076", "611752105030646106", "611752105030646134", "611752105030646147", "611752105030647154", "611752105030647258", "611752105030647263", "611752105030647335", "611752105030647409", "611752105030647439", "611752105030647456", "611752105030647469", "611752105030647493", "611752105030647496", "611752105030647512", "611752105030647526", "611752105030647735", "611752105030647850", "611752105030647856", "611752105030647863", "611752105030649520", "611752105030649525", "611752105030649535", "611752105030649557", "611752105030649597", "611752105030649603", "611752105030649621", "611752105030649627", "611752105030649645", "611752105030649647", "611752105030649648", "611752105030649650", "611752105030649652", "611752105030649657", "611752105030649661", "611752105030650191", "611752105030650212", "611752105030650762", "611752105030650773", "611752105030650788", "611752105030650800", "611752105030650807", "611752105030652092", "611752105030652107", "611752105030652116", "611752105030652135", "611752105030652143", "611752105030652148", "611752105030652183", "611752105030652188", "611752105030652306", "611752105030652522", "611752105030652547", "611752105030652554", "611752105030652565", "611752105030652580", "611752105030652586", "611752105030652606", "611752105030652613", "611752105030652619", "611752105030652628", "611752105030652635", "611752105030652645", "611752105030652657", "611752105030652666", "611752105030652675", "611752105030652710", "611752105030652734", "611752105030652740", "611752105030652759", "611752105030652832", "611752105030652843", "611752105030652867", "611752105030652890", "611752105030652901", "611752105030652955", "611752105030652962", "611752105030653015", "611752105030653040", "611752105030653077", "611752105030653092", "611752105030653100", "611752105030653125", "611752105030653141", "611752105030653177", "611752105030659127", "611752105030659135", "611752105030659138", "611752105030659139", "611752105030659140", "611752105030659145", "611752105030659146", "611752105030659150", "611752105030659157", "611752105030659158", "611752105030659160", "611752105030659164", 
"611752105030659167", "611752105030659176", "611752105030659178", "611752105030659180", "611752105030659183", "611752105030659190", "611752105030659197", "611752105030659201", "611752105030659205", "611752105030659206", "611752105030659210", "611752105030659213", "611752105030659214", "611752105030659222", "611752105030659228", "611752105030659230", "611752105030659235", "611752105030659239", "611752105030659241", "611752105030659245", "611752105030659247", "611752105030659248", "611752105030659249", "611752105030659252", "611752105030659253", "611752105030659255", "611752105030659259", "611752105030659261", "611752105030659263", "611752105030659265", "611752105030659266", "611752105030659268", "611752105030659271", "611752105030659272", "611752105030659275", "611752105030659276", "611752105030659279", "611752105030659280", "611752105030659285", "611752105030659286", "611752105030659287", "611752105030659892", "611752105030659898", "611752105030659904", "611752105030659907", "611752105030659915", "611752105030659923", "611752105030659929", "611752105030659936", "611752105030659942", "611752105030659957", "611752105030659965", "611752105030659974", "611752105030659982", "611752105030659986", "611752105030659992", "611752105030659998", "611752105030660002", "611752105030660009", "611752105030660018", "611752105030660030", "611752105030660038", "611752105030661242" ] ban = deepcopy(banned_user_map) ban["db"] = "av_db" for sid in arr: url = get_url_by_song_id(sid) if url is not None: print("out,{},{}".format(url, sid)) # 不在数据库中 sql = "select song_id from svc_queue_table where song_id={} and song_src=3".format(sid) data = get_data_by_mysql(sql, ban) if len(data) == 0: tm = int(time.time()) sql = "replace INTO svc_queue_table (song_id, url, create_time, update_time, song_src) VALUES ({}, \"{}\",{}, {}, 3)" \ .format(sid, url, tm, tm) update_db(sql, ban) def get_data_from_song(): sql = """ select tb1.song_id, tb1.recording_count from ( select song_id,recording_count from starmaker.song where song_src in (108,109) and song_status = 2 order by recording_count desc ) as tb1 left join ( select song_id from av_db.svc_queue_table ) as tb2 on tb1.song_id = tb2.song_id where tb2.song_id is null - order by tb1.recording_count desc limit 1000 + order by tb1.recording_count desc limit 5000 """ ban = deepcopy(banned_user_map) ban_v1 = deepcopy(banned_user_map) ban["db"] = "starmaker_musicbook" ban_v1["db"] = "av_db" data = get_data_by_mysql(sql, ban) for dt in data: sid = dt[0] url = get_url_by_song_id(sid) if url is not None: print("out,{},{}".format(url, sid)) tm = int(time.time()) sql = "insert INTO svc_queue_table (song_id, url, create_time, update_time, song_src) VALUES ({}, \"{}\", {}, {}, 3)" \ .format(sid, url, tm, tm) update_db(sql, ban_v1) if __name__ == '__main__': # get_diff_song() get_data_from_song() # process()