diff --git a/AutoCoverTool/online/inference_one.py b/AutoCoverTool/online/inference_one.py index 0974c8e..8113e10 100644 --- a/AutoCoverTool/online/inference_one.py +++ b/AutoCoverTool/online/inference_one.py @@ -1,684 +1,688 @@ """ 单个处理的逻辑 song_id: ---src.mp3 // 源数据,需要提前放进去 ---cache ---vocal.wav // 分离之后产生 ---acc.wav // 分离之后产生 ---vocal_32.wav // 分离之后产生 ---song_id_sp1.wav // 合成之后产生 ---song_id_sp2.wav // 合成之后产生 ---song_id_sp2_d.wav // 降噪之后生成 ---song_id_sp2_dv.wav // 降噪+拉伸之后产生 [占比太高的不产生] ---song_id_sp2_dve442.wav // 手动调整之后产生 ---song_id_sp2_dve442_replace.wav // 替换之后产生 ---song_id_sp2_dve442_replace_mix.wav // 人声+伴奏混合之后产生 ---song_id --acc.mp3 // 44k双声道320k --vocal.mp3 // 44k双声道320k --src.mp3 // 44k双声道320k --song_id_sp2_dv.mp3 // 44k单声道320k ---song_id_out // 对外输出 --src.mp3 // 原始音频 --song_id_sp2_dv_replace_mix.mp3 // 制作完成的音频 环境安装: conda create -n auto_song_cover python=3.9 # 安装demucs环境[进入到ref.music_remover 执行pip install -r requirements.txt] # 安装so_vits_svc环境[进入到ref.so_vits_svc 执行pip install -r requirements.txt] pip install librosa pip install scikit-maad pip install praat-parselmouth pip install matplotlib pip install torchvision pip install madmom pip install torchstat 环境设置: export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame """ import os import time import shutil import random import logging import librosa logging.basicConfig(filename='/tmp/inference.log', level=logging.INFO) gs_err_code_success = 0 gs_err_code_no_src_mp3 = 1 gs_err_code_separate = 2 gs_err_code_trans_32 = 3 gs_err_code_encode_err = 4 gs_err_code_replace_err = 5 gs_err_code_replace_trans_err = 6 gs_err_code_mix_err = 7 gs_err_code_mix_transcode_err = 8 gs_err_code_no_src_dir = 9 gs_err_code_volume_err = 10 gs_err_code_trans2_442 = 11 gs_err_code_reverb = 12 gs_err_code_no_good_choice = 13 gs_err_code_preprocess_vocal = 14 gs_err_code_replace_except_err = 15 gs_denoise_exe = "/opt/soft/bin/denoise_exe" gs_draw_volume_exe = "/opt/soft/bin/draw_volume" gs_simple_mixer_path = "/opt/soft/bin/simple_mixer" gs_rever_path = "/opt/soft/bin/dereverbrate" from ref.music_remover.separate_interface import SeparateInterface from ref.so_vits_svc.inference_main import * from ref.split_dirty_frame.script.process_one import ReplaceVocalFrame, construct_power_fragment class SongCoverInference: def __init__(self): self.work_dir = None self.cache_dir = None self.cid = None self.src_mp3 = None self.vocal_path = None self.vocal_32_path = None self.acc_path = None self.speakers = [ 10414574138721494, 10414574140317353, 1688849864840588, 3634463651, 5629499489839033, 5910973794723621, 6755399374234747, 8162774327817435, 8162774329368194, 1125899914308640, # 以下为男声,包括这个 12384898975368914, 12947848931397021, 3096224748076687, 3096224751151928, 5066549357604730, 5348024335101054, 6755399442719465, 7036874421386111 ] self.speakers2gender = { 10414574138721494: 2, 10414574140317353: 2, 1688849864840588: 2, 3634463651: 2, 5629499489839033: 2, 5910973794723621: 2, 6755399374234747: 2, 8162774327817435: 2, 8162774329368194: 2, 1125899914308640: 1, # 1是男 12384898975368914: 1, 12947848931397021: 1, 3096224748076687: 1, 3096224751151928: 1, 5066549357604730: 1, 5348024335101054: 1, 6755399442719465: 1, 7036874421386111: 1 } self.speakers_model_path = "data/train_users/{}/logs/32k/G_2000.pth" self.speakers_model_config = "data/train_users/{}/config/config.json" st = time.time() self.separate_inst = None logging.info("post process ... ReplaceVocalFrame init sp={}".format(time.time() - st)) self.replace_vocal_frame_inst = None logging.info("SongCoverInference init sp={}".format(time.time() - st)) def separate(self, cid, src_mp3, vocal_path, acc_path): """ 人声伴奏分离 :param cid: :param src_mp3: :param vocal_path: :param acc_path: :return: """ st = time.time() if self.separate_inst is None: self.separate_inst = SeparateInterface() if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path): return gs_err_code_separate if not os.path.exists(vocal_path) or not os.path.exists(acc_path): return gs_err_code_separate # 转码出一个32k单声道的数据 cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {} -loglevel fatal".format(vocal_path, self.vocal_32_path) os.system(cmd) if not os.path.exists(self.vocal_32_path): return gs_err_code_trans_32 print("separate:cid={}|sp={}".format(cid, time.time() - st)) return gs_err_code_success def get_start_ms(self, vocal_path): """ 给定原始音频,找一段连续10s的音频 :param vocal_path: :return: """ audio, sr = librosa.load(vocal_path, sr=16000) audio = librosa.util.normalize(audio) # 帧长100ms,帧移10ms,计算能量 power_arr = [] for i in range(0, len(audio) - 1600, 160): power_arr.append(np.sum(np.abs(audio[i:i + 160])) / 160) # 将能量小于等于10的部分做成段 power_arr = construct_power_fragment(power_arr) fragments = [] last_pos = 0 for idx, line in enumerate(power_arr): start = round(float(line[0]) * 0.01, 3) duration = round(float(line[1]) * 0.01, 3) fragments.append([last_pos, start - last_pos]) last_pos = start + duration if last_pos < len(audio) / sr: fragments.append([last_pos, len(audio) / sr - last_pos]) # 合并数据,两者间隔在50ms以内的合并起来 idx = 0 while idx < len(fragments) - 1: if fragments[idx + 1][0] - (fragments[idx][0] + fragments[idx][1]) < 0.05: fragments[idx][1] = fragments[idx + 1][0] + fragments[idx + 1][1] - fragments[idx][0] del fragments[idx + 1] idx -= 1 idx += 1 # out_file = vocal_path + "_power.csv" # with open(out_file, "w") as f: # f.write("Name\tStart\tDuration\tTime Format\tType\n") # for fragment in fragments: # start = round(float(fragment[0]), 3) # duration = round(float(fragment[1]), 3) # strr = "{}\t{}\t{}\t{}\n".format("11", start, duration, "decimal\tCue\t") # f.write(strr) # 筛选出开始的位置 # 1. 连续时长大于10s,当前段长度大于3s # 2. 不可用 # 从0到fragments[idx], 包含idx其中人声段的总和 tot_vocal_duration = [fragments[0][1]] for i in range(1, len(fragments)): tot_vocal_duration.append(tot_vocal_duration[i - 1] + fragments[i][1]) # 计算出任意两段之间非人声占比 for i in range(0, len(fragments)): if fragments[i][1] >= 3: now_tot = 0 if i > 0: now_tot = tot_vocal_duration[i - 1] for j in range(i + 1, len(fragments)): cur_rate = tot_vocal_duration[j] - now_tot cur_rate = cur_rate / (fragments[j][1] + fragments[j][0] - fragments[i][0]) if cur_rate > 0.1: return fragments[i][0] return -1 def inference_speaker(self): """ 推理生成合成后的音频 随机取5个干声,选择占比最小的,并且要求占比小于0.3 :return: """ st = time.time() out_speakers = random.sample(self.speakers, 15) out_songs_dict = {} for speaker in out_speakers: model_path = self.speakers_model_path.format(speaker) config_path = self.speakers_model_config.format(speaker) song_path = os.path.join(self.cache_dir, "{}_{}.wav".format(self.cid, speaker)) try: inf(model_path, config_path, self.vocal_32_path, song_path, "prod") except Exception as ex: logging.info("cid={}, inference_speaker err={}".format(self.cid, ex)) continue if os.path.exists(song_path): if self.replace_vocal_frame_inst is None: self.replace_vocal_frame_inst = ReplaceVocalFrame( "data/models/split_dirty_frame_v5_3_epoch3_852.pth") rate = self.replace_vocal_frame_inst.get_rate(song_path) if rate < 0.3: out_songs_dict[song_path] = rate # 从内部选择占比最低的 out_songs = [] if len(out_songs_dict.keys()) > 0: st_sec = self.get_start_ms(self.vocal_path) song_msg = sorted(out_songs_dict.items(), key=lambda kv: kv[1])[0] out_songs = [song_msg[0]] logging.info("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2), round(st_sec, 3))) print("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2), round(st_sec, 3))) # logging.info("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st)) print("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st)) return out_songs def get_new_vocal_rate(self, songs): """ 获取人声的比率 :param songs: :return: """ st = time.time() need_to_process_song = [] for song in songs: if self.replace_vocal_frame_inst is None: self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth") rate = self.replace_vocal_frame_inst.get_rate(song) logging.info("{} {} replace_rate={}".format(self.cid, song, rate)) if rate < 1.0: need_to_process_song.append(song) logging.info( "get_new_vocal_rate belen = {} len = {} finish sp = {}".format(len(songs), len(need_to_process_song), time.time() - st)) return need_to_process_song def preprocess_vocal(self, songs, vocal_path): """ 1. 降噪 2. 拉伸 :param songs: :param vocal_path: 参考的音频信号 :return: """ st = time.time() dv_out_list = [] for song in songs: denoise_path = str(song).replace(".wav", "_d.wav") cmd = "{} {} {}".format(gs_denoise_exe, song, denoise_path) os.system(cmd) if not os.path.exists(denoise_path): print("{} {} ERROR denoise".format(self.cid, song)) continue # 拉伸 volume_path = str(song).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, denoise_path, vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR denoise".format(self.cid, volume_path)) continue dv_out_list.append(volume_path) print( "preprocess_vocal belen = {} len = {} finish sp = {}".format(len(songs), len(dv_out_list), time.time() - st)) return dv_out_list def output(self, dv_out_list): """ 对外输出数据 :param dv_out_list: :return: """ st = time.time() out_dir = os.path.join(self.work_dir, self.cid) if os.path.exists(out_dir): shutil.rmtree(out_dir) os.makedirs(out_dir) # 拷贝数据 dst_mp3_path = os.path.join(out_dir, "src_mp3") dst_acc_path = os.path.join(out_dir, "acc.mp3") dst_vocal_path = os.path.join(out_dir, "vocal.mp3") shutil.copyfile(self.src_mp3, dst_mp3_path) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.acc_path, dst_acc_path) os.system(cmd) if not os.path.exists(dst_acc_path): return gs_err_code_encode_err cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.vocal_path, dst_vocal_path) os.system(cmd) if not os.path.exists(dst_vocal_path): return gs_err_code_encode_err # 将所有数据放到out_dir中,用于给人工标注 for dv_wav in dv_out_list: dv_wav_name = str(dv_wav).split("/")[-1].replace(".wav", "_441.mp3") dst_dv_path = os.path.join(out_dir, dv_wav_name) cmd = "ffmpeg -i {} -ar 44100 -ac 1 -ab 320k -y {} -loglevel fatal".format(dv_wav, dst_dv_path) os.system(cmd) if not os.path.exists(dst_dv_path): print("{} encode err!".format(cmd)) continue logging.info( "preprocess_vocal output sp = {}".format(time.time() - st)) def process_one(self, cid, work_dir, enable_output=False): logging.info("\nstart:cid={},work_dir={}----------------------->>>>>>>>".format(cid, work_dir)) self.cid = cid self.work_dir = work_dir # 所有不对外交付的,全部放到这里 self.cache_dir = os.path.join(work_dir, "cache") if os.path.exists(self.cache_dir): shutil.rmtree(self.cache_dir) os.makedirs(self.cache_dir) self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.src_mp3): return gs_err_code_no_src_mp3 self.vocal_path = os.path.join(self.cache_dir, "vocal.wav") self.vocal_32_path = os.path.join(self.cache_dir, "vocal_32.wav") self.acc_path = os.path.join(self.cache_dir, "acc.wav") if not os.path.exists(self.vocal_32_path): logging.info("start separate ... {} {} {}".format(self.src_mp3, self.vocal_path, self.acc_path)) err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path) if err != gs_err_code_success: return err, None, None logging.info("start inference_speaker ...") out_songs = self.inference_speaker() dv_out_list = self.preprocess_vocal(out_songs, self.vocal_path) if len(dv_out_list) == 0: return gs_err_code_no_good_choice, None, None mix_mp3_path = None gender = -1 if enable_output: self.output(dv_out_list) else: # 默认全部处理一遍 for dv_out_path in dv_out_list: src_path = dv_out_path.replace("_dv.wav", ".wav") err, mix_mp3_path = self.after_process(self.cid, self.work_dir, src_path, dv_out_path, self.vocal_path, self.acc_path, True, False) if err != gs_err_code_success: logging.info("after_process err {}".format(err)) # 取出性别属性 if err == gs_err_code_success and mix_mp3_path is not None: gender = self.speakers2gender[int(str(os.path.basename(mix_mp3_path)).split("_")[1])] logging.info("finish:cid={},work_dir={}----------------------->>>>>>>>".format(cid, work_dir)) return gs_err_code_success, mix_mp3_path, gender def reverb_by_vocal(self, file): st = time.time() file_442 = file.replace(".wav", "_442.wav") if not os.path.exists(file_442): cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(file, file_442) os.system(cmd) if not os.path.exists(file_442): return gs_err_code_trans2_442, None file_dst = file.replace(".wav", "_442_dr.wav") cmd = "{} {} {} {}".format(gs_rever_path, self.vocal_path, file_442, file_dst) os.system(cmd) if not os.path.exists(file_dst): return gs_err_code_reverb, None print("cid = {}, reverb_by_vocal sp={}".format(self.cid, time.time() - st)) return gs_err_code_success, file_dst def after_process(self, cid, work_dir, in_file, effect_file, vocal_file, acc_file, need_draw=True, need_reverb=True): """ 后处理逻辑 将处理好的音频进行替换,然后和伴奏进行混合,最后进行编码 :return: """ if need_reverb: # 抓取混响 err, effect_file = self.reverb_by_vocal(in_file) if err != gs_err_code_success: return err, None if need_draw: # 增加一个拉伸的步骤 volume_path = str(effect_file).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, vocal_file, volume_path) print(cmd) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None effect_file = volume_path st = time.time() self.cid = cid self.work_dir = work_dir self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.work_dir): return gs_err_code_no_src_dir self.replace_vocal_frame_inst.process(in_file, effect_file, vocal_file) dst_path = effect_file + "_replace.wav" if not os.path.exists(dst_path): return gs_err_code_replace_err, None print("replace_vocal_frame_inst sp = {}".format(time.time() - st)) # 转码 dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav") cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442) os.system(cmd) if not os.path.exists(dst_path_442): return gs_err_code_replace_trans_err, None # 合并转码后再做一次拉伸,保证响度 volume_path = str(dst_path_442).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, vocal_file, volume_path) print(cmd) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None dst_path_442 = volume_path # 混合 mix_path = dst_path_442.replace("_replace442.wav", "_replace442_mix.wav") cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, acc_file, mix_path) print("{}".format(cmd)) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix_err, None # 编码为mp3 output_dir = os.path.join(self.work_dir, self.cid + "_out") if not os.path.exists(output_dir): os.makedirs(output_dir) name = str(mix_path).replace("_replace442_mix.wav", "_replace442_mix.mp3").split("/")[-1] mix_path_mp3 = os.path.join(output_dir, name) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_mix_transcode_err, None # 拷贝src到output_dir # shutil.copyfile(self.src_mp3, os.path.join(output_dir, "src.mp3")) # logging.info("after_process sp = {}".format(time.time() - st)) return gs_err_code_success, mix_path_mp3 ####################################新对外接口############################################################ def prepare_env(self, cid, work_dir, create_dir=False): self.cid = cid self.work_dir = work_dir # 所有不对外交付的,全部放到这里 self.cache_dir = os.path.join(work_dir, "cache") if create_dir: if os.path.exists(self.cache_dir): shutil.rmtree(self.cache_dir) os.makedirs(self.cache_dir) self.src_mp3 = os.path.join(self.work_dir, "src.mp3") if not os.path.exists(self.src_mp3): return gs_err_code_no_src_mp3 self.vocal_path = os.path.join(self.cache_dir, "vocal.wav") self.vocal_32_path = os.path.join(self.cache_dir, "vocal_32.wav") self.acc_path = os.path.join(self.cache_dir, "acc.wav") return gs_err_code_success def generate_svc_file(self, cid, work_dir): """ :param cid: :param work_dir: :return:err_code, 生成出的svc的文件名称 """ err = self.prepare_env(cid, work_dir, create_dir=True) if err != gs_err_code_success: return err, None # 音源分离 if not os.path.exists(self.vocal_32_path): st = time.time() err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path) logging.info("cid={},separate,sp={}".format(self.cid, time.time() - st)) if err != gs_err_code_success: return err, None # 生成svc,只保留一个最佳的 st = time.time() out_songs = self.inference_speaker() if len(out_songs) == 0: return gs_err_code_no_good_choice, None logging.info("cid={},inference_speaker,{},sp={}".format(self.cid, out_songs[0], time.time() - st)) - - # 预处理人声 - dv_out_list = self.preprocess_vocal(out_songs, self.vocal_path) - if len(dv_out_list) == 0: - return gs_err_code_preprocess_vocal, None - return gs_err_code_success, dv_out_list[0] + return gs_err_code_success, out_songs[0] def effect(self, cid, work_dir, svc_file): st = time.time() err = self.prepare_env(cid, work_dir) if err != gs_err_code_success: return err, None logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st)) + + # 预处理人声 + dv_out_list = self.preprocess_vocal([svc_file], self.vocal_path) + if len(dv_out_list) == 0: + return gs_err_code_preprocess_vocal, None + svc_file = dv_out_list[0] + # 做音效 st = time.time() err, effect_file = self.reverb_by_vocal(svc_file) if err != gs_err_code_success: return err, None logging.info("cid={},reverb_by_vocal,{},sp={}".format(self.cid, svc_file, time.time() - st)) return err, effect_file def mix(self, cid, work_dir, svc_file, effect_file): """ 做音效以及合并 :param cid: :param work_dir: :param svc_file: :param effect_file: :return: err_code, 完成的mp3文件 """ st = time.time() err = self.prepare_env(cid, work_dir) if err != gs_err_code_success: return err, None logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 拉伸 st = time.time() volume_path = str(effect_file).replace(".wav", "_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, self.vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None effect_file = volume_path logging.info("cid={},draw_volume,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 替换 st = time.time() try: + if self.replace_vocal_frame_inst is None: + self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth") self.replace_vocal_frame_inst.process(svc_file, effect_file, self.vocal_path) except Exception as ex: logging.info("{},replace_vocal_frame_inst, {}", self.cid, ex) return gs_err_code_replace_except_err, None dst_path = effect_file + "_replace.wav" if not os.path.exists(dst_path): return gs_err_code_replace_err, None logging.info("cid={},replace_vocal_frame_inst,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 转码 st = time.time() dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav") cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442) os.system(cmd) if not os.path.exists(dst_path_442): return gs_err_code_replace_trans_err, None logging.info("cid={},transcode,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 合并转码后再做一次拉伸,保证响度 st = time.time() volume_path = str(dst_path_442).replace("_replace442.wav", "_replace442_dv.wav") cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, self.vocal_path, volume_path) os.system(cmd) if not os.path.exists(volume_path): print("{} {} ERROR draw volume".format(self.cid, volume_path)) return gs_err_code_volume_err, None dst_path_442 = volume_path logging.info("cid={},draw_volume2,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 混合 st = time.time() mix_path = dst_path_442.replace("_replace442_dv.wav", "_replace442_dv_mix.wav") cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, self.acc_path, mix_path) os.system(cmd) if not os.path.exists(mix_path): return gs_err_code_mix_err, None logging.info("cid={},mixer,{},sp={}".format(self.cid, svc_file, time.time() - st)) # 编码为mp3 st = time.time() output_dir = os.path.join(self.work_dir, self.cid + "_out") if not os.path.exists(output_dir): os.makedirs(output_dir) name = str(mix_path).replace("_replace442_dv_mix.wav", "_replace442_dv_mix.mp3").split("/")[-1] mix_path_mp3 = os.path.join(output_dir, name) cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3) print(cmd) os.system(cmd) if not os.path.exists(mix_path_mp3): return gs_err_code_mix_transcode_err, None logging.info("cid={},encode,{},sp={}".format(self.cid, svc_file, time.time() - st)) return gs_err_code_success, mix_path_mp3 def get_gender(self, svc_file): - return self.speakers2gender[int(os.path.basename(svc_file).split("_")[1])] + return self.speakers2gender[int(os.path.basename(svc_file.replace(".wav", "")).split("_")[1])] def process_one_logic(self, cid, work_dir): """ 搞成两部分: 1. 分离数据+5次推理,获取最佳结果,并保存 2. 利用最佳结果做音效以及合并 :return: """ err, svc_file = self.generate_svc_file(cid, work_dir) gender = -1 if err != gs_err_code_success: return err, svc_file, gender, gender = self.get_gender(svc_file) err, effect_file = self.effect(cid, work_dir, svc_file) if err != gs_err_code_success: return err, svc_file, gender err, mix_mp3_path = self.mix(cid, work_dir, svc_file, effect_file) return err, mix_mp3_path, gender def test(): arr = [ # "611752105020343687", # "611752105023532439", # "611752105030419688", # "611752105030485748", "611752105030485685" ] base_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test" s_inst = SongCoverInference() for cid in arr: st = time.time() # err, mix_mp3, gender = s_inst.process_one(cid, os.path.join(base_dir, cid), False) err, mix_mp3, gender = s_inst.process_one_logic(cid, os.path.join(base_dir, cid)) print(mix_mp3, gender) print("cid={} RealFinish err={} sp={}".format(cid, err, time.time() - st)) if __name__ == '__main__': test() diff --git a/AutoCoverTool/script/get_song_url.py b/AutoCoverTool/script/get_song_url.py index 6327796..1e5314f 100644 --- a/AutoCoverTool/script/get_song_url.py +++ b/AutoCoverTool/script/get_song_url.py @@ -1,379 +1,280 @@ """ 获取歌曲的地址 # song_src=2 是来源108和109的歌曲,未被洗过的 # song_src=1 是曲库给的 # song_src=3 # 用于轻变调的 """ from script.common import * from copy import deepcopy from online.common import update_db def get_url_by_song_id(song_id): sql = "select task_url,starmaker_songid from silence where starmaker_songid = {} order by task_id limit 1".format( song_id) ban = deepcopy(banned_user_map) ban["db"] = "starmaker_musicbook" data = get_data_by_mysql(sql, ban) if len(data) > 0: return data[0][0] return None def process(): arr = [ - "611752105020332343", - "611752105022647065", - "611752105022704186", - "611752105022729268", - "611752105022736024", - "611752105022739648", - "611752105022739650", - "611752105022741712", - "611752105022743896", - "611752105022746068", - "611752105022747108", - "611752105022757968", - "611752105022763880", - "611752105022763884", - "611752105022764688", - "611752105022764801", - "611752105022766341", - "611752105022767186", - "611752105022770004", - "611752105022770306", - "611752105022773633", - "611752105022773776", - "611752105022774127", - "611752105022774502", - "611752105022775091", - "611752105022775486", - "611752105022775907", - "611752105022776719", - "611752105022776721", - "611752105022776761", - "611752105022776857", - "611752105022777051", - "611752105022777076", - "611752105022777328", - "611752105022777573", - "611752105022777607", - "611752105022777608", - "611752105022777611", - "611752105022777835", - "611752105022780287", - "611752105022781374", - "611752105022785018", - "611752105022785313", - "611752105022812895", - "611752105022825467", - "611752105022837452", - "611752105022837464", - "611752105022840319", - "611752105022840637", - "611752105022841089", - "611752105022841355", - "611752105022842184", - "611752105022843089", - "611752105022843139", - "611752105022843331", - "611752105022843710", - "611752105022843728", - "611752105022876795", - "611752105022973113", - "611752105023184121", - "611752105023234496", - "611752105023258864", - "611752105023262008", - "611752105023301455", - "611752105023306231", - "611752105023329571", - "611752105023411931", - "611752105023449798", - "611752105023458990", - "611752105023610603", - "611752105023678577", - "611752105023683357", - "611752105023841037", - "611752105023929521", - "611752105024170140", - "611752105024466658", - "611752105024683212", - "611752105024765795", - "611752105024766050", - "611752105025475926", - "611752105025486355", - "611752105025503613", - "611752105025506533", - "611752105025515144", - "611752105025521388", - "611752105025524664", - "611752105025524932", - "611752105025526555", - "611752105025542775", - "611752105025542802", - "611752105025543710", - "611752105025555350", - "611752105025558173", - "611752105025565020", - "611752105025565029", - "611752105025565034", - "611752105025578884", - "611752105025581305", - "611752105026003288", - "611752105026090255", - "611752105026152320", - "611752105026180638", - "611752105026180797", - "611752105026205984", - "611752105026227884", - "611752105026343282", - "611752105026417620", - "611752105026449246", - "611752105026462848", - "611752105026533657", - "611752105026577993", - "611752105026614487", - "611752105026666894", - "611752105026666899", - "611752105026666904", - "611752105026666918", - "611752105026666950", - "611752105026666964", - "611752105026666995", - "611752105026667014", - "611752105026667025", - "611752105027030955", - "611752105027216307", - "611752105027228689", - "611752105027228702", - "611752105027460125", - "611752105027802526", - "611752105027854263", - "611752105028204403", - "611752105028408823", - "611752105028477541", - "611752105028558157", - "611752105028593043", - "611752105028793344", - "611752105028820643", - "611752105028820644", - "611752105028858622", - "611752105028878359", - "611752105028916096", - "611752105028916098", - "611752105028990740", - "611752105029006327", - "611752105029047058", - "611752105029054046", - "611752105029059915", - "611752105029204262", - "611752105029291293", - "611752105029306974", - "611752105029372452", - "611752105029648535", - "611752105030146069", - "611752105030483301", - "611752105030483312", - "611752105030499117", - "611752105030499185", - "611752105030499265", - "611752105030499310", - "611752105030503847", - "611752105030547499", - "611752105030547630", - "611752105030547632", - "611752105030547638", - "611752105030557261", - "611752105030557355", - "611752105030558663", - "611752105030559471", - "611752105030562192", - "611752105030562194", - "611752105030562196", - "611752105030562197", - "611752105030562199", - "611752105030562203", - "611752105030562205", - "611752105030562209", - "611752105030562211", - "611752105030562213", - "611752105030562214", - "611752105030562218", - "611752105030562221", - "611752105030562227", - "611752105030562228", - "611752105030562231", - "611752105030562234", - "611752105030562236", - "611752105030562239", - "611752105030562243", - "611752105030562245", - "611752105030562248", - "611752105030562251", - "611752105030562254", - "611752105030562255", - "611752105030562259", - "611752105030562262", - "611752105030562263", - "611752105030562266", - "611752105030562268", - "611752105030562271", - "611752105030562274", - "611752105030562277", - "611752105030562286", - "611752105030562289", - "611752105030562291", - "611752105030562296", - "611752105030562302", - "611752105030562303", - "611752105030562306", - "611752105030562311", - "611752105030562314", - "611752105030562316", - "611752105030562322", - "611752105030562325", - "611752105030562327", - "611752105030562333", - "611752105030562335", - "611752105030562337", - "611752105030562338", - "611752105030562345", - "611752105030562351", - "611752105030562378", - "611752105030562380", - "611752105030562383", - "611752105030562389", - "611752105030562391", - "611752105030562392", - "611752105030562397", - "611752105030562398", - "611752105030562399", - "611752105030562401", - "611752105030562404", - "611752105030562405", - "611752105030562411", - "611752105030562413", - "611752105030562414", - "611752105030562417", - "611752105030562419", - "611752105030562424", - "611752105030562425", - "611752105030562426", - "611752105030562428", - "611752105030562431", - "611752105030562448", - "611752105030562457", - "611752105030562459", - "611752105030562460", - "611752105030562463", - "611752105030562470", - "611752105030562472", - "611752105030562473", - "611752105030562479", - "611752105030562483", - "611752105030562489", - "611752105030562493", - "611752105030562494", - "611752105030562499", - "611752105030562502", - "611752105030562504", - "611752105030562507", - "611752105030562512", - "611752105030562513", - "611752105030562517", - "611752105030562522", - "611752105030562919", - "611752105030562921", - "611752105030562924", - "611752105030562925", - "611752105030562929", - "611752105030562931", - "611752105030562936", - "611752105030562938", - "611752105030562939", - "611752105030562940", - "611752105030562943", - "611752105030562950", - "611752105030562953", - "611752105030562954", - "611752105030562959", - "611752105030562960", - "611752105030562962", - "611752105030562968", - "611752105030562974", - "611752105030562978", - "611752105030562979", - "611752105030562981", - "611752105030562983", - "611752105030562986", - "611752105030562988", - "611752105030562999", - "611752105030563001", - "611752105030563003", - "611752105030563005", - "611752105030563006", - "611752105030563010", - "611752105030563014", - "611752105030563022", - "611752105030563025", - "611752105030563028", - "611752105030563031", - "611752105030563034", - "611752105030563035", - "611752105030563043" + "611752105030484885", + "611752105029543722", + "611752105030556608", + "611752105030585154", + "611752105030556609", + "611752105029054060", + "611752105028975148", + "611752105030558172", + "611752105028778344", + "611752105030556613", + "611752105029290698", + "611752105030556605", + "611752105027484924", + "611752105030559472", + "611752105030534293", + "611752105027148644", + "611752105029292630", + "611752105026900917", + "611752105027103140", + "611752105030589795", + "611752105026915170", + "611752105030534289", + "611752105026751742", + "611752105026452638", + "611752105025979421", + "611752105025817810", + "611752105026536899", + "611752105030534282", + "611752105030534285", + "611752105030559474", + "611752105025219762", + "611752105025034426", + "611752105024938926", + "611752105029648740", + "611752105029675859", + "611752105024598727", + "611752105030548412", + "611752105030487271", + "611752105029648743", + "611752105023692976", + "611752105024135802", + "611752105023616288", + "611752105023255629", + "611752105022728286", + "611752105023206033", + "611752105023091102", + "611752105029792918", + "611752105022729259", + "611752105030487512", + "611752105022842120", + "611752105022842054", + "611752105022785621", + "611752105022840550", + "611752105022838205", + "611752105022839189", + "611752105022835751", + "611752105022818025", + "611752105022797521", + "611752105022784390", + "611752105028820609", + "611752105030488595", + "611752105030517536", + "611752105030501857", + "611752105030478339", + "611752105025957389", + "611752105027484925", + "611752105027484915", + "611752105024415490", + "611752105027854244", + "611752105029527187", + "611752105028870536", + "611752105028444597", + "611752105028778353", + "611752105027877846", + "611752105028906605", + "611752105027781526", + "611752105027877887", + "611752105027795229", + "611752105027734187", + "611752105028820612", + "611752105027626964", + "611752105027460080", + "611752105027507932", + "611752105027611342", + "611752105027435127", + "611752105027307631", + "611752105029648514", + "611752105026874730", + "611752105030591117", + "611752105026437853", + "611752105025541483", + "611752105026536913", + "611752105022647044", + "611752105023440333", + "611752105023460357", + "611752105023604729", + "611752105023510939", + "611752105022842387", + "611752105024230229", + "611752105023674599", + "611752105023160140", + "611752105022647074", + "611752105022615220", + "611752105028408822", + "611752105022816170", + "611752105022772279", + "611752105022614618", + "611752105020417684", + "611752105020382477", + "611752105022780345", + "611752105022780961", + "611752105022837186", + "611752105022778042", + "611752105022775939", + "611752105022764224", + "611752105022781267", + "611752105022839030", + "611752105022767294", + "611752105022784996", + "611752105022775600", + "611752105022780284", + "611752105022768837", + "611752105030590847", + "611752105022780965", + "611752105022779020", + "611752105022777496", + "611752105022781268", + "611752105022785681", + "611752105022779294", + "611752105022823781", + "611752105022780210", + "611752105022774220", + "611752105022768419", + "611752105030590845", + "611752105022835406", + "611752105022774040", + "611752105022783776", + "611752105022781193", + "611752105020390942", + "611752105022783967", + "611752105022763051", + "611752105022780818", + "611752105022835415", + "611752105022782935", + "611752105020402448", + "611752105022781011", + "611752105020384960", + "611752105022779784", + "611752105022781387", + "611752105025580424", + "611752105022765022", + "611752105025492732", + "611752105023683356", + "611752105022842241", + "611752105024231227", + "611752105029291290", + "611752105023104185", + "611752105025565044", + "611752105025458749", + "611752105025458753", + "611752105025090763", + "611752105030590839", + "611752105030534180", + "611752105023908922", + "611752105027326105", + "611752105023725727", + "611752105022647079", + "611752105024082232", + "611752105029648891", + "611752105025504662", + "611752105025496983", + "611752105026716551", + "611752105029648872", + "611752105022614531", + "611752105029041707", + "611752105030483313", + "611752105023219237", + "611752105022842989", + "611752105022746733", + "611752105023162802", + "611752105022729263", + "611752105022777120", + "611752105025584544", + "611752105025458809", + "611752105027648113", + "611752105030590840", + "611752105024183682", + "611752105023086347", + "611752105022839975", + "611752105025348359", + "611752105022781144", + "611752105022647060", + "611752105022728482", + "611752105025840622", + "611752105022836470", + "611752105023246015", + "611752105022838206", + "611752105022780355", + "611752105022768062", + "611752105022777600" ] ban = deepcopy(banned_user_map) ban["db"] = "av_db" for sid in arr: url = get_url_by_song_id(sid) if url is not None: print("out,{},{}".format(url, sid)) # 不在数据库中 sql = "select song_id from svc_queue_table where song_id={}".format(sid) data = get_data_by_mysql(sql, ban) if len(data) == 0: - sql = "insert INTO svc_queue_table (song_id, url, create_time, update_time, song_src) VALUES ({}, \"{}\", NOW(), NOW(), 1)" \ - .format(sid, url) + tm = int(time.time()) + sql = "insert INTO svc_queue_table (song_id, url, create_time, update_time, song_src) VALUES ({}, \"{}\",{}, {}, 1)" \ + .format(sid, url, tm, tm) update_db(sql, ban) def get_data_from_song(): sql = """ select tb1.song_id, tb1.recording_count from ( select song_id,recording_count from starmaker.song where song_src in (108,109) and song_status = 2 order by recording_count desc ) as tb1 left join ( select song_id from av_db.svc_queue_table ) as tb2 on tb1.song_id = tb2.song_id where tb2.song_id is null order by tb1.recording_count desc limit 400 """ ban = deepcopy(banned_user_map) ban_v1 = deepcopy(banned_user_map) ban["db"] = "starmaker_musicbook" ban_v1["db"] = "av_db" data = get_data_by_mysql(sql, ban) for dt in data: sid = dt[0] url = get_url_by_song_id(sid) if url is not None: print("out,{},{}".format(url, sid)) sql = "replace INTO svc_queue_table (song_id, url, create_time, update_time) VALUES ({}, \"{}\", NOW(), NOW())" \ .format(sid, url) update_db(sql, ban_v1) if __name__ == '__main__': # get_data_from_song() process() diff --git a/AutoCoverTool/script/shuffle_music.py b/AutoCoverTool/script/shuffle_music.py index 0f80872..81b3e60 100644 --- a/AutoCoverTool/script/shuffle_music.py +++ b/AutoCoverTool/script/shuffle_music.py @@ -1,225 +1,263 @@ """ 载入人声,将人声的频谱进行向上平移 """ import librosa import soundfile import numpy as np from copy import deepcopy def local_maxium(x): """ 求序列的极大值 :param x: :return: """ d = np.diff(x) l_d = len(d) maxium = [] loc = [] for i in range(l_d - 1): if d[i] > 0 and d[i + 1] <= 0: maxium.append(x[i + 1]) loc.append(i + 1) return maxium, loc def Formant_Cepst(u, cepstL): """ 来源: https://github.com/taw19960426/-Speech-signal-processing-experiment-tutorial-_python/blob/master/%E5%85%B1%E6%8C%AF%E5%B3%B0%E4%BC%B0%E8%AE%A1%E5%87%BD%E6%95%B0.py 倒谱法共振峰估计函数 :param u:输入信号 :param cepstL:🔪频率上窗函数的宽度 :return: val共振峰幅值 :return: loc共振峰位置 :return: spec包络线 """ wlen2 = len(u) // 2 u_fft = np.fft.fft(u) # 按式(2-1)计算 U = np.log(np.abs(u_fft[:wlen2])) Cepst = np.fft.ifft(U) # 按式(2-2)计算 cepst = np.zeros(wlen2, dtype=np.complex) cepst[:cepstL] = Cepst[:cepstL] # 按式(2-3)计算 cepst[-cepstL + 1:] = Cepst[-cepstL + 1:] # 取第二个式子的相反 spec = np.real(np.fft.fft(cepst)) val, loc = local_maxium(spec) # 在包络线上寻找极大值 return val, loc, spec +def get_ref_stft(): + sr = 44100 + audio, sr = librosa.load( + "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_ref.wav", \ + sr=sr, mono=True) + stft = librosa.stft(audio, n_fft=2048) + stft = stft.transpose() + print(stft.shape) + data = np.mean(np.abs(stft), axis=0) + data = data / np.max(data) + return data + + def test(in_vocal): import matplotlib.pyplot as plt sr = 44100 audio, sr = librosa.load(in_vocal, sr=sr, mono=True) + stft = librosa.stft(audio, n_fft=2048) stft = stft.transpose() new_stft = np.zeros_like(stft) - for ii in range(0, len(stft)): + w1 = get_ref_stft() + data = np.mean(np.abs(stft), axis=0) + data = data / np.max(data) + w = w1 / data - power = np.abs(stft[ii]) - power = power / (np.max(power)) - - x = np.array(list(range(0, len(stft[ii])))) - y = power - - new_x = [] - new_y = [] - for i in range(1, len(x) - 1, 1): - if y[i - 1] < y[i] > y[i + 1] and y[i] > 0.01: - new_x.append(x[i]) - new_y.append(y[i]) - - # 前后100hz的合并 - x = new_x - y = new_y - new_x = [] - new_y = [] - for i in range(1, len(x) - 1, 1): - if y[i - 1] < y[i] > y[i + 1]: - if x[i] - x[i - 1] > 5: - new_x.append(x[i - 1]) - new_y.append(y[i - 1]) - new_x.append(x[i]) - new_y.append(y[i]) - if x[i + 1] - x[i] > 5: - new_x.append(x[i + 1]) - new_y.append(y[i + 1]) - - if len(new_x) <= 1: - new_stft[ii] = deepcopy(stft[ii]) - continue - # 从第一共振峰开始向上加 - st_freq_idx = 1 - for i in range(st_freq_idx, len(stft[ii])): - dst_i = int(i * 1.12) - if dst_i >= len(stft[ii]): - continue - new_stft[ii][dst_i] = stft[ii][i] - new_stft[ii][0] = stft[ii][0] - # for i in range(0, len(stft[ii])): + for ii in range(0, len(stft)): + # 第一种,整体向上+3 + # for i in range(0, 3): # new_stft[ii][i] = stft[ii][i] + # for i in range(0, len(stft[ii]) - 3): + # dst_i = i + 3 + # new_stft[ii][dst_i] = stft[ii][i] - # new_stft[ii] = deepcopy(stft[ii]) + # 第二种,整体向上拉伸1.12倍[2个音高] + # for i in range(0, 1): + # new_stft[ii][i] = stft[ii][i] + # for i in range(1, len(stft[ii])): + # dst_i = int(i * 1.12 + 0.5) + # if dst_i >= len(stft[ii]): + # break + # new_stft[ii][dst_i] += stft[ii][i] - # # # 从0.01开始向后走 + # 第三种,第一共振峰部分不移动,其他部分移动 + # power = np.abs(stft[ii]) + # power = power / (np.max(power)) + # + # x = np.array(list(range(0, len(stft[ii])))) + # y = power + # + # new_x = [] + # new_y = [] + # for i in range(1, len(x) - 1, 1): + # if y[i - 1] < y[i] > y[i + 1] and y[i] > 0.01: + # new_x.append(x[i]) + # new_y.append(y[i]) + # + # # 前后100hz的合并 + # x = new_x + # y = new_y + # new_x = [] + # new_y = [] + # for i in range(1, len(x) - 1, 1): + # if y[i - 1] < y[i] > y[i + 1]: + # if x[i] - x[i - 1] > 5: + # new_x.append(x[i - 1]) + # new_y.append(y[i - 1]) + # new_x.append(x[i]) + # new_y.append(y[i]) + # if x[i + 1] - x[i] > 5: + # new_x.append(x[i + 1]) + # new_y.append(y[i + 1]) + # + # if len(new_x) <= 1: + # new_stft[ii] = deepcopy(stft[ii]) + # continue + # + # # 从第一共振峰开始向上加 # st_freq_idx = new_x[1] - # if len(new_x) >= 3: - # st_freq_idx = new_x[2] - # music_idx = int(4000 / (sr / 2048)) - # # 当前频率翻1.19倍 - # kk = -0.19 / (music_idx - st_freq_idx) - # bb = 1 - music_idx * kk # for i in range(st_freq_idx, len(stft[ii])): - # cur_rate = i * kk + bb - # if i >= music_idx: - # cur_rate = 1.0 - # dst_idx = int(i * cur_rate + 0.5) - # if dst_idx >= len(stft[ii]): - # break - # new_stft[ii][dst_idx] += stft[ii][i] - # - # # 加平滑 - # st_freq_1 = new_x[1] - # # 当前频率从1倍翻到1.19倍 - # kk = 0.19 / (st_freq_idx - st_freq_1) - # bb = 1 - st_freq_1 * kk - # for i in range(st_freq_1, st_freq_idx): - # cur_rate = i * kk + bb - # dst_idx = int(i * cur_rate + 0.5) - # if dst_idx >= len(stft[ii]): - # break - # new_stft[ii][dst_idx] += stft[ii][i] - # for i in range(0, st_freq_1): - # new_stft[ii][i] += stft[ii][i] + # dst_i = int(i * 1.12 + 0.5) + # if dst_i >= len(stft[ii]): + # continue + # new_stft[ii][dst_i] = stft[ii][i] + # new_stft[ii][0] = stft[ii][0] + # for i in range(0, st_freq_idx): + # new_stft[ii][i] = stft[ii][i] + new_stft[ii] = stft[ii] * w new_stft = new_stft.transpose() istft = librosa.istft(new_stft) soundfile.write(str(in_vocal).replace(".wav", "_out.wav"), istft, 44100, format="wav") +def test_v5(vocal, vocal_ref, vocal_ref2): + sr = 44100 + audio, sr = librosa.load(vocal, sr=sr, mono=True) + stft = librosa.stft(audio, n_fft=2048) + stft = stft.transpose() + new_stft = np.zeros_like(stft) + + audio_ref, sr = librosa.load(vocal_ref, sr=sr, mono=True) + stft_ref = librosa.stft(audio_ref, n_fft=2048) + stft_ref = stft_ref.transpose() + + audio_ref2, sr = librosa.load(vocal_ref2, sr=sr, mono=True) + stft_ref2 = librosa.stft(audio_ref2, n_fft=2048) + stft_ref2 = stft_ref2.transpose() + + w1 = np.ones(len(stft[0])) + for i in range(0, 800): + w1[i] = i / 800 + w2 = 1.0 - w1 + for i in range(0, min(len(stft), len(stft_ref2), len(stft_ref))): + # new_stft[i] = stft_ref2[i] * w2 + stft[i] * w1 + w = np.abs(stft_ref2[i]) / np.abs(stft[i]) + new_stft[i] = w * stft[i] + + new_stft = new_stft.transpose() + istft = librosa.istft(new_stft) + soundfile.write(str(vocal).replace(".wav", "_out5.wav"), istft, 44100, format="wav") + + def ttt(path): from scipy.signal import lfilter import matplotlib.pyplot as plt # path="C4_3_y.wav" # data, fs = soundBase('C4_3_y.wav').audioread() data, fs = librosa.load(path, sr=44100, mono=True) # sr=None声音保持原采样频率, mono=False声音保持原通道数 # 预处理-预加重 u = lfilter([1, -0.99], [1], data) cepstL = 7 wlen = len(u) wlen2 = wlen // 2 print("帧长={}".format(wlen)) print("帧移={}".format(wlen2)) # wlen = 256 # wlen2 = 256//2 # 预处理-加窗 u2 = np.multiply(u, np.hamming(wlen)) # 预处理-FFT,取对数 获得频域图像 取一半 U_abs = np.log(np.abs(np.fft.fft(u2))[:wlen2]) # 4.3.1 freq = [i * fs / wlen for i in range(wlen2)] # print(freq) # val共振峰幅值 loc共振峰位置 spec包络线 val, loc, spec = Formant_Cepst(u, cepstL) plt.subplot(2, 1, 1) plt.plot(freq, U_abs, 'k') plt.xlabel('频率/Hz') # 设置x,y轴的标签 plt.ylabel('幅值') plt.title('男性a的发音频谱') plt.subplot(2, 1, 2) plt.plot(freq, spec, 'k') plt.xlabel('频率/Hz') # 设置x,y轴的标签 plt.ylabel('幅值') plt.title('倒谱法共振峰估计') for i in range(len(loc)): plt.subplot(2, 1, 2) plt.plot([freq[loc[i]], freq[loc[i]]], [np.min(spec), spec[loc[i]]], '-.k') plt.text(freq[loc[i]], spec[loc[i]], 'Freq={}'.format(int(freq[loc[i]]))) # plt.savefig('images/共振峰估计.png') plt.show() plt.close() def main(path): import numpy as np import pyworld as pw from scipy.signal import freqz import librosa import math """ 思路: 先变调,再轻微调整共振峰进行合成 """ base_rate = 1.05946 pitch = 0 fs = 44100 x, sr = librosa.load(path, sr=fs, mono=True) x = x.reshape(-1).astype(np.float) f0, t = pw.dio(x, fs) f0 = pw.stonemask(x, f0, t, fs) sp = pw.cheaptrick(x, f0, t, fs) sp2 = np.zeros_like(sp) cur_rate = 1 for i in range(sp.shape[1]): sp2[:, i] = sp[:, min(int(i * 1 / cur_rate), sp.shape[1] - 1)] ap = pw.d4c(x, f0, t, fs) rate = math.pow(base_rate, pitch) out = pw.synthesize(f0 * rate, sp2, ap, fs).reshape(-1, 1) soundfile.write(path.replace(".wav", "_out2.wav"), out, fs) if __name__ == '__main__': # vc = VoiceChanger() # vc.process("/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal.wav", # "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_out1.wav") # test( - # "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal.wav") + # "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal.wav") + + vocal_pp = "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_p1.wav" + vocal_p2 = "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal.wav" + vocal_p3 = "/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_p2.wav" + test_v5(vocal_pp, vocal_p2, vocal_p3) - main("/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_p2.wav") + # main("/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_p2.wav") # ttt("/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_02_01.wav")