diff --git a/AutoCoverTool/online/inference_one.py b/AutoCoverTool/online/inference_one.py
index 8113e10..a73d87d 100644
--- a/AutoCoverTool/online/inference_one.py
+++ b/AutoCoverTool/online/inference_one.py
@@ -1,688 +1,689 @@
"""
Single-song processing logic
song_id:
    ---src.mp3                                // source file, must be placed here in advance
    ---cache
        ---vocal.wav                          // produced by source separation
        ---acc.wav                            // produced by source separation
        ---vocal_32.wav                       // produced by source separation
        ---song_id_sp1.wav                    // produced by synthesis
        ---song_id_sp2.wav                    // produced by synthesis
        ---song_id_sp2_d.wav                  // produced by denoising
        ---song_id_sp2_dv.wav                 // produced by denoise + volume stretch [not produced when the dirty-frame ratio is too high]
        ---song_id_sp2_dve442.wav             // produced by manual adjustment
        ---song_id_sp2_dve442_replace.wav     // produced by frame replacement
        ---song_id_sp2_dve442_replace_mix.wav // produced by mixing vocal + accompaniment
    ---song_id
        --acc.mp3                             // 44k stereo 320k
        --vocal.mp3                           // 44k stereo 320k
        --src.mp3                             // 44k stereo 320k
        --song_id_sp2_dv.mp3                  // 44k mono 320k
    ---song_id_out                            // external deliverables
        --src.mp3                             // original audio
        --song_id_sp2_dv_replace_mix.mp3      // finished audio

Environment setup:
conda create -n auto_song_cover python=3.9
# Install the demucs environment [cd into ref.music_remover and run pip install -r requirements.txt]
# Install the so_vits_svc environment [cd into ref.so_vits_svc and run pip install -r requirements.txt]
pip install librosa
pip install scikit-maad
pip install praat-parselmouth
pip install matplotlib
pip install torchvision
pip install madmom
pip install torchstat
Environment variables:
export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin
export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame
"""

import os
import time
import shutil
import random
import logging
import librosa
import numpy as np  # used below; also re-exported by the star import, made explicit here

logging.basicConfig(filename='/tmp/inference.log', level=logging.INFO)

gs_err_code_success = 0
gs_err_code_no_src_mp3 = 1
gs_err_code_separate = 2
gs_err_code_trans_32 = 3
gs_err_code_encode_err = 4
gs_err_code_replace_err = 5
gs_err_code_replace_trans_err = 6
gs_err_code_mix_err = 7
gs_err_code_mix_transcode_err = 8
gs_err_code_no_src_dir = 9
gs_err_code_volume_err = 10
gs_err_code_trans2_442 = 11
gs_err_code_reverb = 12
gs_err_code_no_good_choice = 13
gs_err_code_preprocess_vocal = 14
gs_err_code_replace_except_err = 15

gs_denoise_exe = "/opt/soft/bin/denoise_exe"
gs_draw_volume_exe = "/opt/soft/bin/draw_volume"
gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"
gs_rever_path = "/opt/soft/bin/dereverbrate"

from ref.music_remover.separate_interface import SeparateInterface
from ref.so_vits_svc.inference_main import *
from ref.split_dirty_frame.script.process_one import ReplaceVocalFrame, construct_power_fragment


class SongCoverInference:
    def __init__(self):
        self.work_dir = None
        self.cache_dir = None
        self.cid = None
        self.src_mp3 = None
        self.vocal_path = None
        self.vocal_32_path = None
        self.acc_path = None
        self.speakers = [
            10414574138721494, 10414574140317353, 1688849864840588, 3634463651,
            5629499489839033, 5910973794723621, 6755399374234747, 8162774327817435,
            8162774329368194,
            1125899914308640,  # male voices from here on (inclusive)
            12384898975368914, 12947848931397021, 3096224748076687, 3096224751151928,
            5066549357604730, 5348024335101054, 6755399442719465, 7036874421386111
        ]
        self.speakers2gender = {
            10414574138721494: 2, 10414574140317353: 2, 1688849864840588: 2, 3634463651: 2,
            5629499489839033: 2, 5910973794723621: 2, 6755399374234747: 2, 8162774327817435: 2,
            8162774329368194: 2,
            1125899914308640: 1,  # 1 = male
            12384898975368914: 1, 12947848931397021: 1, 3096224748076687: 1, 3096224751151928: 1,
            5066549357604730: 1, 5348024335101054: 1, 6755399442719465: 1, 7036874421386111: 1
        }
        self.speakers_model_path = "data/train_users/{}/logs/32k/G_2000.pth"
        self.speakers_model_config = "data/train_users/{}/config/config.json"
        st = time.time()
        self.separate_inst = None
        logging.info("post process ... ReplaceVocalFrame init sp={}".format(time.time() - st))
        self.replace_vocal_frame_inst = None
        logging.info("SongCoverInference init sp={}".format(time.time() - st))
ReplaceVocalFrame init sp={}".format(time.time() - st)) self.replace_vocal_frame_inst = None logging.info("SongCoverInference init sp={}".format(time.time() - st)) def separate(self, cid, src_mp3, vocal_path, acc_path): """ 人声伴奏分离 :param cid: :param src_mp3: :param vocal_path: :param acc_path: :return: """ st = time.time() if self.separate_inst is None: self.separate_inst = SeparateInterface() if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path): return gs_err_code_separate if not os.path.exists(vocal_path) or not os.path.exists(acc_path): return gs_err_code_separate # 转码出一个32k单声道的数据 cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {} -loglevel fatal".format(vocal_path, self.vocal_32_path) os.system(cmd) if not os.path.exists(self.vocal_32_path): return gs_err_code_trans_32 print("separate:cid={}|sp={}".format(cid, time.time() - st)) return gs_err_code_success def get_start_ms(self, vocal_path): """ 给定原始音频,找一段连续10s的音频 :param vocal_path: :return: """ audio, sr = librosa.load(vocal_path, sr=16000) audio = librosa.util.normalize(audio) # 帧长100ms,帧移10ms,计算能量 power_arr = [] for i in range(0, len(audio) - 1600, 160): power_arr.append(np.sum(np.abs(audio[i:i + 160])) / 160) # 将能量小于等于10的部分做成段 power_arr = construct_power_fragment(power_arr) fragments = [] last_pos = 0 for idx, line in enumerate(power_arr): start = round(float(line[0]) * 0.01, 3) duration = round(float(line[1]) * 0.01, 3) fragments.append([last_pos, start - last_pos]) last_pos = start + duration if last_pos < len(audio) / sr: fragments.append([last_pos, len(audio) / sr - last_pos]) # 合并数据,两者间隔在50ms以内的合并起来 idx = 0 while idx < len(fragments) - 1: if fragments[idx + 1][0] - (fragments[idx][0] + fragments[idx][1]) < 0.05: fragments[idx][1] = fragments[idx + 1][0] + fragments[idx + 1][1] - fragments[idx][0] del fragments[idx + 1] idx -= 1 idx += 1 # out_file = vocal_path + "_power.csv" # with open(out_file, "w") as f: # f.write("Name\tStart\tDuration\tTime Format\tType\n") # for fragment in fragments: # start = round(float(fragment[0]), 3) # duration = round(float(fragment[1]), 3) # strr = "{}\t{}\t{}\t{}\n".format("11", start, duration, "decimal\tCue\t") # f.write(strr) # 筛选出开始的位置 # 1. 连续时长大于10s,当前段长度大于3s # 2. 

    def inference_speaker(self):
        """
        Run inference to synthesize candidate covers.
        Randomly sample speakers (15 in code), keep the candidate with the lowest
        dirty-frame rate, and require that rate < 0.3
        :return:
        """
        st = time.time()
        out_speakers = random.sample(self.speakers, 15)
        out_songs_dict = {}
        for speaker in out_speakers:
            model_path = self.speakers_model_path.format(speaker)
            config_path = self.speakers_model_config.format(speaker)
            song_path = os.path.join(self.cache_dir, "{}_{}.wav".format(self.cid, speaker))
            try:
                inf(model_path, config_path, self.vocal_32_path, song_path, "prod")
            except Exception as ex:
                logging.info("cid={}, inference_speaker err={}".format(self.cid, ex))
                continue
            if os.path.exists(song_path):
                if self.replace_vocal_frame_inst is None:
                    self.replace_vocal_frame_inst = ReplaceVocalFrame(
                        "data/models/split_dirty_frame_v5_3_epoch3_852.pth")
                rate = self.replace_vocal_frame_inst.get_rate(song_path)
                if rate < 0.3:
                    out_songs_dict[song_path] = rate

        # Keep the candidate with the lowest rate
        out_songs = []
        if len(out_songs_dict.keys()) > 0:
            st_sec = self.get_start_ms(self.vocal_path)
            song_msg = sorted(out_songs_dict.items(), key=lambda kv: kv[1])[0]
            out_songs = [song_msg[0]]
            logging.info("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2),
                                                                          round(st_sec, 3)))
            print("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2),
                                                                   round(st_sec, 3)))
        # logging.info("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st))
        print("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st))
        return out_songs

    def get_new_vocal_rate(self, songs):
        """
        Get the dirty-frame rate of each vocal
        :param songs:
        :return:
        """
        st = time.time()
        need_to_process_song = []
        for song in songs:
            if self.replace_vocal_frame_inst is None:
                self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth")
            rate = self.replace_vocal_frame_inst.get_rate(song)
            logging.info("{} {} replace_rate={}".format(self.cid, song, rate))
            if rate < 1.0:
                need_to_process_song.append(song)
        logging.info(
            "get_new_vocal_rate belen = {} len = {} finish sp = {}".format(len(songs), len(need_to_process_song),
                                                                           time.time() - st))
        return need_to_process_song

    def preprocess_vocal(self, songs, vocal_path):
        """
        1. Denoise
        2. Volume stretch
        :param songs:
        :param vocal_path: reference audio signal
        :return:
        """
        st = time.time()
        dv_out_list = []
        for song in songs:
            denoise_path = str(song).replace(".wav", "_d.wav")
            cmd = "{} {} {}".format(gs_denoise_exe, song, denoise_path)
            os.system(cmd)
            if not os.path.exists(denoise_path):
                print("{} {} ERROR denoise".format(self.cid, song))
                continue

            # Volume stretch
            volume_path = str(song).replace(".wav", "_dv.wav")
            cmd = "{} {} {} {}".format(gs_draw_volume_exe, denoise_path, vocal_path, volume_path)
            os.system(cmd)
            if not os.path.exists(volume_path):
                print("{} {} ERROR draw_volume".format(self.cid, volume_path))
                continue
            dv_out_list.append(volume_path)
        print(
            "preprocess_vocal belen = {} len = {} finish sp = {}".format(len(songs), len(dv_out_list),
                                                                         time.time() - st))
        return dv_out_list

    def output(self, dv_out_list):
        """
        Export data for external use
        :param dv_out_list:
        :return:
        """
        st = time.time()
        out_dir = os.path.join(self.work_dir, self.cid)
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)

        # Copy the source and transcode vocal/accompaniment
        dst_mp3_path = os.path.join(out_dir, "src.mp3")  # was "src_mp3", which dropped the extension
        dst_acc_path = os.path.join(out_dir, "acc.mp3")
        dst_vocal_path = os.path.join(out_dir, "vocal.mp3")
        shutil.copyfile(self.src_mp3, dst_mp3_path)
        cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.acc_path, dst_acc_path)
        os.system(cmd)
        if not os.path.exists(dst_acc_path):
            return gs_err_code_encode_err
        cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.vocal_path, dst_vocal_path)
        os.system(cmd)
        if not os.path.exists(dst_vocal_path):
            return gs_err_code_encode_err

        # Put everything into out_dir for manual annotation
        for dv_wav in dv_out_list:
            dv_wav_name = str(dv_wav).split("/")[-1].replace(".wav", "_441.mp3")
            dst_dv_path = os.path.join(out_dir, dv_wav_name)
            cmd = "ffmpeg -i {} -ar 44100 -ac 1 -ab 320k -y {} -loglevel fatal".format(dv_wav, dst_dv_path)
            os.system(cmd)
            if not os.path.exists(dst_dv_path):
                print("{} encode err!".format(cmd))
                continue
        logging.info(
            "preprocess_vocal output sp = {}".format(time.time() - st))
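preprocess_vocal and output repeat a single pattern: format a shell command, os.system it, then test whether the output file appeared. A hedged sketch of the same contract via subprocess.run, which also surfaces the tool's exit code; _run_tool is a hypothetical helper, not part of this module:

def _run_tool(cmd, out_path):
    """Run an external tool; succeed only if it exits cleanly AND the expected
    output file exists (same checks as the os.system calls above)."""
    import shlex
    import subprocess
    ret = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
    if ret.returncode != 0:
        print("cmd failed rc={} stderr={}".format(ret.returncode, ret.stderr[-200:]))
        return False
    return os.path.exists(out_path)

# usage mirroring preprocess_vocal (paths hypothetical):
#   ok = _run_tool("{} {} {}".format(gs_denoise_exe, song, denoise_path), denoise_path)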
{} {} {}".format(self.src_mp3, self.vocal_path, self.acc_path)) err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path) if err != gs_err_code_success: return err, None, None logging.info("start inference_speaker ...") out_songs = self.inference_speaker() dv_out_list = self.preprocess_vocal(out_songs, self.vocal_path) if len(dv_out_list) == 0: return gs_err_code_no_good_choice, None, None mix_mp3_path = None gender = -1 if enable_output: self.output(dv_out_list) else: # 默认全部处理一遍 for dv_out_path in dv_out_list: src_path = dv_out_path.replace("_dv.wav", ".wav") err, mix_mp3_path = self.after_process(self.cid, self.work_dir, src_path, dv_out_path, self.vocal_path, self.acc_path, True, False) if err != gs_err_code_success: logging.info("after_process err {}".format(err)) # 取出性别属性 if err == gs_err_code_success and mix_mp3_path is not None: gender = self.speakers2gender[int(str(os.path.basename(mix_mp3_path)).split("_")[1])] logging.info("finish:cid={},work_dir={}----------------------->>>>>>>>".format(cid, work_dir)) return gs_err_code_success, mix_mp3_path, gender def reverb_by_vocal(self, file): st = time.time() file_442 = file.replace(".wav", "_442.wav") if not os.path.exists(file_442): cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(file, file_442) os.system(cmd) if not os.path.exists(file_442): return gs_err_code_trans2_442, None file_dst = file.replace(".wav", "_442_dr.wav") cmd = "{} {} {} {}".format(gs_rever_path, self.vocal_path, file_442, file_dst) os.system(cmd) if not os.path.exists(file_dst): return gs_err_code_reverb, None print("cid = {}, reverb_by_vocal sp={}".format(self.cid, time.time() - st)) return gs_err_code_success, file_dst def after_process(self, cid, work_dir, in_file, effect_file, vocal_file, acc_file, need_draw=True, need_reverb=True): """ 后处理逻辑 将处理好的音频进行替换,然后和伴奏进行混合,最后进行编码 :return: """ if need_reverb: # 抓取混响 err, effect_file = self.reverb_by_vocal(in_file) if err != gs_err_code_success: return err, None if need_draw: # 增加一个拉伸的步骤 volume_path = str(effect_file).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, vocal_file, volume_path) print(cmd) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None effect_file = volume_path st = time.time() self.cid = cid self.work_dir = work_dir self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.work_dir): return gs_err_code_no_src_dir self.replace_vocal_frame_inst.process(in_file, effect_file, vocal_file) dst_path = effect_file + "_replace.wav" if not os.path.exists(dst_path): return gs_err_code_replace_err, None print("replace_vocal_frame_inst sp = {}".format(time.time() - st)) # 转码 dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav") cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442) os.system(cmd) if not os.path.exists(dst_path_442): return gs_err_code_replace_trans_err, None # 合并转码后再做一次拉伸,保证响度 volume_path = str(dst_path_442).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, vocal_file, volume_path) print(cmd) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None dst_path_442 = volume_path # 混合 mix_path = dst_path_442.replace("_replace442.wav", "_replace442_mix.wav") cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, acc_file, mix_path) 
print("{}".format(cmd)) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix_err, None # 编码为mp3 output_dir = os.path.join(self.work_dir, self.cid + "_out") if not os.path.exists(output_dir): os.makedirs(output_dir) name = str(mix_path).replace("_replace442_mix.wav", "_replace442_mix.mp3").split("/")[-1] mix_path_mp3 = os.path.join(output_dir, name) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_mix_transcode_err, None # 拷贝src到output_dir # shutil.copyfile(self.src_mp3, os.path.join(output_dir, "src.mp3")) # logging.info("after_process sp = {}".format(time.time() - st)) return gs_err_code_success, mix_path_mp3 ####################################新对外接口############################################################ def prepare_env(self, cid, work_dir, create_dir=False): self.cid = cid self.work_dir = work_dir # 所有不对外交付的,全部放到这里 self.cache_dir = os.path.join(work_dir, "cache") if create_dir: if os.path.exists(self.cache_dir): shutil.rmtree(self.cache_dir) os.makedirs(self.cache_dir) self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.src_mp3): return gs_err_code_no_src_mp3 self.vocal_path = os.path.join(self.cache_dir, "vocal.wav") self.vocal_32_path = os.path.join(self.cache_dir, "vocal_32.wav") self.acc_path = os.path.join(self.cache_dir, "acc.wav") return gs_err_code_success def generate_svc_file(self, cid, work_dir): """ :param cid: :param work_dir: :return:err_code, 生成出的svc的文件名称 """ err = self.prepare_env(cid, work_dir, create_dir=True) if err != gs_err_code_success: return err, None # 音源分离 if not os.path.exists(self.vocal_32_path): st = time.time() err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path) logging.info("cid={},separate,sp={}".format(self.cid, time.time() - st)) if err != gs_err_code_success: return err, None # 生成svc,只保留一个最佳的 st = time.time() out_songs = self.inference_speaker() if len(out_songs) == 0: return gs_err_code_no_good_choice, None logging.info("cid={},inference_speaker,{},sp={}".format(self.cid, out_songs[0], time.time() - st)) return gs_err_code_success, out_songs[0] def effect(self, cid, work_dir, svc_file): st = time.time() err = self.prepare_env(cid, work_dir) if err != gs_err_code_success: return err, None logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 预处理人声 dv_out_list = self.preprocess_vocal([svc_file], self.vocal_path) if len(dv_out_list) == 0: return gs_err_code_preprocess_vocal, None svc_file = dv_out_list[0] # 做音效 st = time.time() err, effect_file = self.reverb_by_vocal(svc_file) if err != gs_err_code_success: return err, None logging.info("cid={},reverb_by_vocal,{},sp={}".format(self.cid, svc_file, time.time() - st)) return err, effect_file def mix(self, cid, work_dir, svc_file, effect_file): """ 做音效以及合并 :param cid: :param work_dir: :param svc_file: :param effect_file: :return: err_code, 完成的mp3文件 """ st = time.time() err = self.prepare_env(cid, work_dir) if err != gs_err_code_success: return err, None logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 拉伸 st = time.time() volume_path = str(effect_file).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, self.vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None effect_file = volume_path 
logging.info("cid={},draw_volume,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 替换 st = time.time() try: if self.replace_vocal_frame_inst is None: self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth") self.replace_vocal_frame_inst.process(svc_file, effect_file, self.vocal_path) except Exception as ex: logging.info("{},replace_vocal_frame_inst, {}", self.cid, ex) return gs_err_code_replace_except_err, None dst_path = effect_file + "_replace.wav" if not os.path.exists(dst_path): return gs_err_code_replace_err, None logging.info("cid={},replace_vocal_frame_inst,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 转码 st = time.time() dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav") cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442) os.system(cmd) if not os.path.exists(dst_path_442): return gs_err_code_replace_trans_err, None logging.info("cid={},transcode,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 合并转码后再做一次拉伸,保证响度 st = time.time() volume_path = str(dst_path_442).replace("_replace442.wav", "_replace442_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, self.vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None dst_path_442 = volume_path logging.info("cid={},draw_volume2,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 混合 st = time.time() mix_path = dst_path_442.replace("_replace442_dv.wav", "_replace442_dv_mix.wav") cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, self.acc_path, mix_path) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix_err, None logging.info("cid={},mixer,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 编码为mp3 st = time.time() output_dir = os.path.join(self.work_dir, self.cid + "_out") if not os.path.exists(output_dir): os.makedirs(output_dir) name = str(mix_path).replace("_replace442_dv_mix.wav", "_replace442_dv_mix.mp3").split("/")[-1] mix_path_mp3 = os.path.join(output_dir, name) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) print(cmd) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_mix_transcode_err, None logging.info("cid={},encode,{},sp={}".format(self.cid, svc_file, time.time() - st)) return gs_err_code_success, mix_path_mp3 def get_gender(self, svc_file): return self.speakers2gender[int(os.path.basename(svc_file.replace(".wav", "")).split("_")[1])] def process_one_logic(self, cid, work_dir): """ 搞成两部分: 1. 分离数据+5次推理,获取最佳结果,并保存 2. 

    def process_one_logic(self, cid, work_dir):
        """
        Split into two stages:
        1. separation + inference over the sampled speakers, keep and save the best result
        2. apply effects to the best result and mix
        :return:
        """
        err, svc_file = self.generate_svc_file(cid, work_dir)
        gender = -1
        if err != gs_err_code_success:
            return err, svc_file, gender

        gender = self.get_gender(svc_file)
        err, effect_file = self.effect(cid, work_dir, svc_file)
        if err != gs_err_code_success:
            return err, svc_file, gender
        err, mix_mp3_path = self.mix(cid, work_dir, svc_file, effect_file)
        return err, mix_mp3_path, gender


def test():
    arr = [
        # "611752105020343687",
        # "611752105023532439",
        # "611752105030419688",
        # "611752105030485748",
-        "611752105030485685"
+        # "611752105030485685",
+        "dzq",
    ]
    base_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test"
    s_inst = SongCoverInference()
    for cid in arr:
        st = time.time()
        # err, mix_mp3, gender = s_inst.process_one(cid, os.path.join(base_dir, cid), False)
        err, mix_mp3, gender = s_inst.process_one_logic(cid, os.path.join(base_dir, cid))
        print(mix_mp3, gender)
        print("cid={} RealFinish err={} sp={}".format(cid, err, time.time() - st))


if __name__ == '__main__':
    test()
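A hedged sketch of driving the new public interface end to end (generate_svc_file -> effect -> mix), mirroring process_one_logic; the cid, work_dir, and helper name _demo_public_api are hypothetical:

def _demo_public_api():
    inst = SongCoverInference()
    cid = "dzq"
    work_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test/dzq"
    err, svc_file = inst.generate_svc_file(cid, work_dir)  # separation + SVC inference
    if err != gs_err_code_success:
        return err
    gender = inst.get_gender(svc_file)                     # 1 = male, 2 = female
    err, effect_file = inst.effect(cid, work_dir, svc_file)  # denoise/stretch/dereverb
    if err != gs_err_code_success:
        return err
    err, mix_mp3 = inst.mix(cid, work_dir, svc_file, effect_file)  # replace + mix + encode
    print(gender, mix_mp3)
    return err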
diff --git a/AutoCoverTool/script/shuffle_music.py b/AutoCoverTool/script/shuffle_music.py
index 81b3e60..b3496d2 100644
--- a/AutoCoverTool/script/shuffle_music.py
+++ b/AutoCoverTool/script/shuffle_music.py
@@ -1,263 +1,429 @@
"""
Load a vocal track and shift its spectrum upward
"""
import librosa
import soundfile
import numpy as np
from copy import deepcopy


def local_maxium(x):
    """
    Find the local maxima of a sequence
    :param x:
    :return:
    """
    d = np.diff(x)
    l_d = len(d)
    maxium = []
    loc = []
    for i in range(l_d - 1):
        if d[i] > 0 and d[i + 1] <= 0:
            maxium.append(x[i + 1])
            loc.append(i + 1)
    return maxium, loc


def Formant_Cepst(u, cepstL):
    """
    Source: https://github.com/taw19960426/-Speech-signal-processing-experiment-tutorial-_python/blob/master/%E5%85%B1%E6%8C%AF%E5%B3%B0%E4%BC%B0%E8%AE%A1%E5%87%BD%E6%95%B0.py
    Cepstrum-based formant estimation
    :param u: input signal
    :param cepstL: width of the lifter window on the quefrency axis
    :return: val formant amplitudes
    :return: loc formant positions
    :return: spec spectral envelope
    """
    wlen2 = len(u) // 2
    u_fft = np.fft.fft(u)  # eq. (2-1)
    U = np.log(np.abs(u_fft[:wlen2]))
    Cepst = np.fft.ifft(U)  # eq. (2-2)
    cepst = np.zeros(wlen2, dtype=complex)  # np.complex is deprecated; use the builtin
    cepst[:cepstL] = Cepst[:cepstL]  # eq. (2-3)
    cepst[-cepstL + 1:] = Cepst[-cepstL + 1:]  # mirror of the second term
    spec = np.real(np.fft.fft(cepst))
    val, loc = local_maxium(spec)  # search for maxima on the envelope
    return val, loc, spec


def get_ref_stft():
    sr = 44100
    audio, sr = librosa.load(
        "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_ref.wav",
        sr=sr, mono=True)
    stft = librosa.stft(audio, n_fft=2048)
    stft = stft.transpose()
    print(stft.shape)
    data = np.mean(np.abs(stft), axis=0)
    data = data / np.max(data)
    return data


def test(in_vocal):
    import matplotlib.pyplot as plt
    sr = 44100
    audio, sr = librosa.load(in_vocal, sr=sr, mono=True)
    stft = librosa.stft(audio, n_fft=2048)
    stft = stft.transpose()
    new_stft = np.zeros_like(stft)
    w1 = get_ref_stft()
    data = np.mean(np.abs(stft), axis=0)
    data = data / np.max(data)
    w = w1 / data
    for ii in range(0, len(stft)):
        # Variant 1: shift everything up by 3 bins
        # for i in range(0, 3):
        #     new_stft[ii][i] = stft[ii][i]
        # for i in range(0, len(stft[ii]) - 3):
        #     dst_i = i + 3
        #     new_stft[ii][dst_i] = stft[ii][i]

        # Variant 2: stretch everything up by a factor of 1.12 [two semitones]
        # for i in range(0, 1):
        #     new_stft[ii][i] = stft[ii][i]
        # for i in range(1, len(stft[ii])):
        #     dst_i = int(i * 1.12 + 0.5)
        #     if dst_i >= len(stft[ii]):
        #         break
        #     new_stft[ii][dst_i] += stft[ii][i]

        # Variant 3: keep the first-formant region fixed, shift the rest
        # power = np.abs(stft[ii])
        # power = power / (np.max(power))
        #
        # x = np.array(list(range(0, len(stft[ii]))))
        # y = power
        #
        # new_x = []
        # new_y = []
        # for i in range(1, len(x) - 1, 1):
        #     if y[i - 1] < y[i] > y[i + 1] and y[i] > 0.01:
        #         new_x.append(x[i])
        #         new_y.append(y[i])
        #
        # # merge peaks within ~100 Hz of each other
        # x = new_x
        # y = new_y
        # new_x = []
        # new_y = []
        # for i in range(1, len(x) - 1, 1):
        #     if y[i - 1] < y[i] > y[i + 1]:
        #         if x[i] - x[i - 1] > 5:
        #             new_x.append(x[i - 1])
        #             new_y.append(y[i - 1])
        #         new_x.append(x[i])
        #         new_y.append(y[i])
        #         if x[i + 1] - x[i] > 5:
        #             new_x.append(x[i + 1])
        #             new_y.append(y[i + 1])
        #
        # if len(new_x) <= 1:
        #     new_stft[ii] = deepcopy(stft[ii])
        #     continue
        #
        # # shift upward starting from the first formant
        # st_freq_idx = new_x[1]
        # for i in range(st_freq_idx, len(stft[ii])):
        #     dst_i = int(i * 1.12 + 0.5)
        #     if dst_i >= len(stft[ii]):
        #         continue
        #     new_stft[ii][dst_i] = stft[ii][i]
        # new_stft[ii][0] = stft[ii][0]
        # for i in range(0, st_freq_idx):
        #     new_stft[ii][i] = stft[ii][i]
-        new_stft[ii] = stft[ii] * w
+        # new_stft[ii] = stft[ii] * w
+        pass
    new_stft = new_stft.transpose()
    istft = librosa.istft(new_stft)
    soundfile.write(str(in_vocal).replace(".wav", "_out.wav"), istft, 44100, format="wav")


def test_v5(vocal, vocal_ref, vocal_ref2):
    sr = 44100
    audio, sr = librosa.load(vocal, sr=sr, mono=True)
    stft = librosa.stft(audio, n_fft=2048)
    stft = stft.transpose()
    new_stft = np.zeros_like(stft)

    audio_ref, sr = librosa.load(vocal_ref, sr=sr, mono=True)
    stft_ref = librosa.stft(audio_ref, n_fft=2048)
    stft_ref = stft_ref.transpose()

    audio_ref2, sr = librosa.load(vocal_ref2, sr=sr, mono=True)
    stft_ref2 = librosa.stft(audio_ref2, n_fft=2048)
    stft_ref2 = stft_ref2.transpose()

    w1 = np.ones(len(stft[0]))
    for i in range(0, 800):
        w1[i] = i / 800
    w2 = 1.0 - w1
    for i in range(0, min(len(stft), len(stft_ref2), len(stft_ref))):
        # new_stft[i] = stft_ref2[i] * w2 + stft[i] * w1
        w = np.abs(stft_ref2[i]) / np.abs(stft[i])
        new_stft[i] = w * stft[i]
    new_stft = new_stft.transpose()
    istft = librosa.istft(new_stft)
    soundfile.write(str(vocal).replace(".wav", "_out5.wav"), istft, 44100, format="wav")


def ttt(path):
    from scipy.signal import lfilter
    import matplotlib.pyplot as plt

    # path = "C4_3_y.wav"
    # data, fs = soundBase('C4_3_y.wav').audioread()
    data, fs = librosa.load(path, sr=44100, mono=True)  # sr=None keeps the original rate, mono=False keeps the original channels

    # Preprocessing: pre-emphasis
    u = lfilter([1, -0.99], [1], data)

    cepstL = 7
    wlen = len(u)
    wlen2 = wlen // 2
    print("frame length = {}".format(wlen))
    print("frame shift = {}".format(wlen2))
    # wlen = 256
    # wlen2 = 256 // 2
    # Preprocessing: windowing
    u2 = np.multiply(u, np.hamming(wlen))
    # Preprocessing: FFT, take the log magnitude of the first half
    U_abs = np.log(np.abs(np.fft.fft(u2))[:wlen2])  # section 4.3.1
    freq = [i * fs / wlen for i in range(wlen2)]
    # print(freq)
    # val: formant amplitudes, loc: formant positions, spec: envelope
    val, loc, spec = Formant_Cepst(u, cepstL)
    plt.subplot(2, 1, 1)
    plt.plot(freq, U_abs, 'k')
    plt.xlabel('Frequency/Hz')
    plt.ylabel('Amplitude')
    plt.title('Spectrum of a male /a/')
    plt.subplot(2, 1, 2)
    plt.plot(freq, spec, 'k')
    plt.xlabel('Frequency/Hz')
    plt.ylabel('Amplitude')
    plt.title('Cepstrum-based formant estimation')
    for i in range(len(loc)):
        plt.subplot(2, 1, 2)
        plt.plot([freq[loc[i]], freq[loc[i]]], [np.min(spec), spec[loc[i]]], '-.k')
        plt.text(freq[loc[i]], spec[loc[i]], 'Freq={}'.format(int(freq[loc[i]])))
    # plt.savefig('images/formant_estimation.png')
    plt.show()
    plt.close()
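Formant_Cepst (used by ttt above) can be sanity-checked on a synthetic spectrum. A sketch, assuming a made-up two-bump envelope and a wider lifter than ttt's cepstL=7 so both bumps stay resolved; _demo_formant_cepst is hypothetical:

def _demo_formant_cepst():
    # Build a signal whose magnitude spectrum equals a smooth two-bump envelope,
    # then check that the liftered envelope peaks near the bump centers.
    fs, n = 8000, 1024
    freqs = np.fft.rfftfreq(n, 1.0 / fs)
    env = 1e-3 + np.exp(-((freqs - 700) / 150) ** 2) + 0.7 * np.exp(-((freqs - 1200) / 150) ** 2)
    u = np.fft.irfft(env, n)  # time signal with |FFT| == env
    val, loc, spec = Formant_Cepst(u, cepstL=30)
    # The largest maxima should fall near 700 and 1200 (bin width fs/n = 7.8 Hz).
    print([round(l * fs / n) for l in loc])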

def main(path):
    import numpy as np
    import pyworld as pw
    from scipy.signal import freqz
    import librosa
    import math
    """
    Approach:
    Shift the pitch first, then slightly adjust the formants before re-synthesis
    """
    base_rate = 1.05946  # one semitone: 2 ** (1 / 12)
    pitch = 0
    fs = 44100
    x, sr = librosa.load(path, sr=fs, mono=True)
    x = x.reshape(-1).astype(np.float64)  # np.float is deprecated; pyworld expects float64
    f0, t = pw.dio(x, fs)
    f0 = pw.stonemask(x, f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    sp2 = np.zeros_like(sp)
    cur_rate = 1
    for i in range(sp.shape[1]):
        sp2[:, i] = sp[:, min(int(i * 1 / cur_rate), sp.shape[1] - 1)]
    ap = pw.d4c(x, f0, t, fs)
    rate = math.pow(base_rate, pitch)  # rate = 2 ** (pitch / 12)
    out = pw.synthesize(f0 * rate, sp2, ap, fs).reshape(-1, 1)
    soundfile.write(path.replace(".wav", "_out2.wav"), out, fs)


+def test_v4(in_file, ref_file):
+    freqs = get_pitch_freq(in_file)
+    # Resample to one frame per millisecond
+    new_freqs = np.zeros(len(freqs) * 5)
+    for i in range(0, len(new_freqs)):
+        new_freqs[i] = freqs[int(i / 5)]
+
+    freqs_ref = get_pitch_freq(ref_file)
+    # Resample to one frame per millisecond
+    new_freqs_ref = np.zeros(len(freqs_ref) * 5)
+    for i in range(0, len(new_freqs_ref)):
+        new_freqs_ref[i] = freqs_ref[int(i / 5)]
+
+    sr = 44100
+    audio, sr = librosa.load(in_file, sr=sr, mono=True)
+    stft = librosa.stft(audio, n_fft=2048)
+    stft = stft.transpose()
+    new_stft = np.zeros_like(stft)
+
+    sr = 44100
+    audio_ref, sr = librosa.load(ref_file, sr=sr, mono=True)
+    stft_ref = librosa.stft(audio_ref, n_fft=2048)
+    stft_ref = stft_ref.transpose()
+    # Locate the fundamental per frame from the pitch tracks; print-and-see for now
+    for i in range(min(len(stft), len(stft_ref))):
+        cur_tm = int(i * 11.61)  # hop of 512 samples at 44.1 kHz is about 11.61 ms
+        cur_pitch = max(new_freqs[cur_tm], new_freqs_ref[cur_tm])
+
+        # # Scheme 1
+        new_stft[i] = stft_ref[i]
+        if cur_pitch < 10:
+            continue
+        # cur_pitch = int(cur_pitch / (44100 / 2048) * 1.5 + 0.5)
+        # # keep the overall energy roughly unchanged
+        # rate = np.sum(np.abs(new_stft[i][:cur_pitch])) / np.sum(np.abs(stft[i][:cur_pitch]))
+        # new_stft[i][:cur_pitch] = stft[i][:cur_pitch] * rate
+
+        # Scheme 2
+        vocal_pitch = int(new_freqs[cur_tm] / (44100 / 2048) * 1.5 + 0.5)
+        ref_vocal_pitch = int(new_freqs_ref[cur_tm] / (44100 / 2048) * 1.5 + 0.5)
+        base_vocal_pitch = vocal_pitch
+        base_ref_vocal_pitch = ref_vocal_pitch
+        # Interleave: 1st formant from vocal, 2nd from ref, 3rd from vocal,
+        # 4th from ref, 5th from vocal, everything after from ref
+        # 1st formant
+        new_stft[i][vocal_pitch - 5:vocal_pitch + 5] = stft[i][vocal_pitch - 5:vocal_pitch + 5]
+        new_stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5] = stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5]
+
+        # 2nd formant
+        vocal_pitch = base_vocal_pitch * 2
+        ref_vocal_pitch = base_ref_vocal_pitch * 2
+        new_stft[i][vocal_pitch - 5:vocal_pitch + 5] = stft[i][vocal_pitch - 5:vocal_pitch + 5]
+        new_stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5] = stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5]
+
+        # 3rd formant
+        vocal_pitch = base_vocal_pitch * 3
+        ref_vocal_pitch = base_ref_vocal_pitch * 3
+        new_stft[i][vocal_pitch - 5:vocal_pitch + 5] = stft[i][vocal_pitch - 5:vocal_pitch + 5]
+        new_stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5] = stft[i][ref_vocal_pitch - 5:ref_vocal_pitch + 5]
+
+        # Scheme 3
+
+        # 5th formant
+        # vocal_pitch = int(vocal_pitch / 3 * 5)
+        # ref_vocal_pitch = int(vocal_pitch / 3 * 5)
+        # new_stft[i][vocal_pitch - 5:] = stft[i][vocal_pitch - 5:]
+        # new_stft[i][ref_vocal_pitch - 5:] = stft[i][ref_vocal_pitch - 5:]
+
+    new_stft = new_stft.transpose()
+    istft = librosa.istft(new_stft)
+    soundfile.write(str(in_file).replace(".wav", "_out7.wav"), istft, 44100, format="wav")
+
+
+def get_pitch_freq(in_file):
+    # 1. transcode to 16k mono
+    # 2. extract pitch
+    import os
+
+    gs_ffmpeg = "/usr/local/bin/ffmpeg"
+    in_16k_wav = in_file + "_16k.wav"
+    cmd = "{} -i {} -ar 16000 -ac 1 -y {}".format(gs_ffmpeg, in_file, in_16k_wav)
+    os.system(cmd)
+    if not os.path.exists(in_16k_wav):
+        return np.array([])
+
+    gs_pitch_exe = "/Users/yangjianli/linux/opt/soft/bin/dpitch"
+    out_path = in_file + ".cache"
+    cmd = "{} {} {}".format(gs_pitch_exe, in_16k_wav, out_path)
+    print("exec:{}\n".format(cmd))
+    os.system(cmd)
+    if not os.path.exists(out_path):
+        return np.array([])
+
+    midi_arr = []
+    with open(out_path, "r") as f:
+        while True:
+            line = f.readline()
+            line = line.strip()
+            if not line:
+                break
+            freq = float(line)
+            midi_arr.append(freq)  # one frame per 5 ms
+    return np.array(midi_arr)
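get_pitch_freq shells out to a local dpitch binary that emits one f0 value per 5 ms frame. Where that binary is unavailable, librosa's built-in pYIN tracker (librosa >= 0.8) gives a comparable track; a sketch with untuned fmin/fmax, _get_pitch_freq_pyin being a hypothetical stand-in:

def _get_pitch_freq_pyin(in_file, sr=16000, hop_ms=5):
    # Rough stand-in for the dpitch binary above: one f0 per 5 ms frame,
    # 0.0 where pYIN judges the frame unvoiced.
    audio, sr = librosa.load(in_file, sr=sr, mono=True)
    hop = int(sr * hop_ms / 1000)  # 80 samples at 16 kHz
    f0, voiced, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
                                 fmax=librosa.note_to_hz('C6'), sr=sr, hop_length=hop)
    return np.where(voiced, f0, 0.0)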
+# NOTE: shadows the three-argument test_v5 defined above
+def test_v5(in_file):
+    sr = 44100
+    audio, sr = librosa.load(in_file, sr=sr, mono=True)
+    stft = librosa.stft(audio, n_fft=2048)
+    stft = stft.transpose()
+
+    w = np.ones(1025) * 0.1
+    for mid in [23, 23 * 2, 23 * 3, 23 * 4]:
+        for i in range(0, 5):
+            rate = i / 5
+            w[mid + 5 - i] = rate * (10 - 0.1) + 0.1
+            w[mid - 5 + i] = rate * (10 - 0.1) + 0.1
+    stft = stft * w
+    stft = stft.transpose()
+    istft = librosa.istft(stft)
+    soundfile.write(str(in_file).replace(".wav", "_out8.wav"), istft, 44100, format="wav")
+
+
+def test_v6(in_file, ref_file):
+    sr = 44100
+    audio, sr = librosa.load(in_file, sr=sr, mono=True)
+    stft = librosa.stft(audio, n_fft=2048)
+    stft = stft.transpose()
+
+    sr = 44100
+    audio_ref, sr = librosa.load(ref_file, sr=sr, mono=True)
+    stft_ref = librosa.stft(audio_ref, n_fft=2048)
+    stft_ref = stft_ref.transpose()
+
+    # Map from A onto B
+    new_stft = np.zeros_like(stft)
+    step = 85
+    new_stft[:step] = stft_ref[:step]
+    a1 = np.mean(np.abs(stft), axis=0)
+    a2 = np.mean(np.abs(stft_ref), axis=0)
+    w = a2 / a1
+    print(np.max(w), np.min(w), np.mean(w))
+    for i in range(step, min(len(stft_ref), len(stft))):
+        a1 = np.mean(np.abs(stft[i - step:i]), axis=0)  # original voice
+        a2 = np.mean(np.abs(stft_ref[i - step:i]), axis=0)  # pitch-shifted voice
+        # w = a2 - a1
+        w = a2 / a1
+        # w[w > 2] = 2
+        # w[w < 0.5] = 0.5
+        # fine-tune: scale the magnitude spectrum, keep the phase
+        new_stft[i] = (np.abs(stft[i]) * w) * np.exp(1j * np.angle(stft[i]))
+
+    new_stft = new_stft.transpose()
+    istft = librosa.istft(new_stft)
+    soundfile.write(str(in_file).replace(".wav", "_out8.wav"), istft, 44100, format="wav")
+
+
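The core move in test_v6 is to rescale each frame's magnitudes while keeping its phases: new = |X| * w * exp(j * angle(X)). A short check on arbitrary toy values; _demo_phase_preserving_scale is hypothetical:

def _demo_phase_preserving_scale():
    x = np.array([3 + 4j, -1 + 1j])  # toy STFT bins
    w = np.array([2.0, 0.5])         # arbitrary per-bin magnitude weights
    y = (np.abs(x) * w) * np.exp(1j * np.angle(x))
    print(np.abs(y))                              # [10.  0.70710678] -> magnitudes scaled
    print(np.allclose(np.angle(y), np.angle(x)))  # True -> phases preserved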
"/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/out1/vocal_p2.wav" + test_v6(vv, vref)