diff --git a/AutoCoverTool/online/beanstalk_helper.py b/AutoCoverTool/online/beanstalk_helper.py
new file mode 100644
index 0000000..6045d98
--- /dev/null
+++ b/AutoCoverTool/online/beanstalk_helper.py
@@ -0,0 +1,36 @@
+import beanstalkc
+import logging
+import os
+
+
+class BeanstalkHelper:
+    def __init__(self, config):
+        print("beanstalk init ! config is:", config)
+        self.beanstalk = None
+        self.config = config
+        self._init = False
+
+    def init_beanstalks(self):
+        addr_list = self.config['addr'].split(':')
+        return self.connect_beanstalkd(addr_list[0], addr_list[1])
+
+    @staticmethod
+    def connect_beanstalkd(host, port):
+        logging.info("beanstalk host=%s port=%s" % (host, port))
+        beanstalkd = beanstalkc.Connection(host=host, port=int(port))
+        return beanstalkd
+
+    # The two public-facing interfaces
+    def get_beanstalkd(self):
+        if not self._init or not self.beanstalk:
+            self.beanstalk = self.init_beanstalks()
+            self._init = True
+        return self.beanstalk
+
+    def put_payload_to_beanstalk(self, tube, message, priority=2 ** 31, delay=0, ttr=180):
+        beanstalk = self.get_beanstalkd()
+        if beanstalk is not None:
+            beanstalk.use(tube)
+            beanstalk.put(message, priority=priority, delay=delay, ttr=ttr)
+            return True
+        return False
diff --git a/AutoCoverTool/online/inference_one_v1.py b/AutoCoverTool/online/inference_one_v1.py
new file mode 100644
index 0000000..92e9495
--- /dev/null
+++ b/AutoCoverTool/online/inference_one_v1.py
@@ -0,0 +1,190 @@
+"""
+Processing logic for a single song
+Environment installation:
+conda create -n auto_song_cover python=3.9
+# Install the demucs environment [go into ref.music_remover and run pip install -r requirements.txt]
+# Install the so_vits_svc environment [go into ref.so_vits_svc and run pip install -r requirements.txt]
+pip install librosa
+pip install scikit-maad
+pip install praat-parselmouth
+pip install matplotlib
+pip install torchvision
+pip install madmom
+pip install torchstat
+Environment setup:
+export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin
+export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame
+"""
+
+import os
+import glob
+import json
+import shutil
+import librosa
+from ref.so_vits_svc.inference_main import *
+from ref.adaptive_voice_conversion.spk_compare1 import infer_load, infer_main
+
+gs_res_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/online_models"
+gs_model_dir = os.path.join(gs_res_dir, 'models')
+gs_config_path = os.path.join(gs_res_dir, 'config.json')
+
+gs_draw_volume_exe = "/opt/soft/bin/draw_volume"
+gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"
+
+# Error codes
+gs_scg_success = 0
+gs_scg_no_vocal = 1
+gs_scg_svc_trans_442 = 2
+gs_scg_svc_volume = 3
+gs_scg_svc_mix = 4
+gs_scg_svc_trans_mix = 5
+
+
+class SongCoverGenerator:
+    def __init__(self):
+        self.models = glob.glob(os.path.join(gs_model_dir, "*/*pth"))
+        self.gs_infer = infer_load()
+
+    def mix(self, cid, work_dir, svc_file, vocal_file, acc_file, mix_path):
+        """
+        :param cid:
+        :param work_dir:
+        :param svc_file:
+        :param vocal_file:
+        :param acc_file:
+        :param mix_path:
+        :return:
+        """
+        cache_dir = os.path.join(work_dir, "cache")
+        if os.path.exists(cache_dir):
+            shutil.rmtree(cache_dir)
+        os.makedirs(cache_dir)
+
+        # Transcode the SVC output to 44.1 kHz / 2 channels
+        svc_442_file = os.path.join(cache_dir, "442.wav")
+        st = time.time()
+        cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(svc_file, svc_442_file)
+        os.system(cmd)
+        if not os.path.exists(svc_442_file):
+            return gs_scg_svc_trans_442
+        logging.info("cid={},transcode,{},sp={}".format(cid, svc_file, time.time() - st))
+
+        # After transcoding, stretch the volume once more so loudness matches the original vocal
+        st = time.time()
+        volume_path = os.path.join(cache_dir, "volume.wav")
+        cmd =
"{} {} {} {}".format(gs_draw_volume_exe, svc_442_file, vocal_file, volume_path) + os.system(cmd) + if not os.path.exists(volume_path): + print("{} {} ERROR draw volume".format(cid, volume_path)) + return gs_scg_svc_volume + logging.info("cid={},draw_volume2,{},sp={}".format(cid, svc_file, time.time() - st)) + + # 混合 + st = time.time() + mix_wav_path = os.path.join(cache_dir, "mix.wav") + cmd = "{} {} {} {}".format(gs_simple_mixer_path, volume_path, acc_file, mix_wav_path) + os.system(cmd) + if not os.path.exists(mix_wav_path): + return gs_scg_svc_mix + logging.info("cid={},mixer,{},sp={}".format(cid, svc_file, time.time() - st)) + + # 编码为m4a + st = time.time() + cmd = "ffmpeg -i {} -ab 128k -y {} -loglevel fatal".format(mix_wav_path, mix_path) + print(cmd) + os.system(cmd) + if not os.path.exists(mix_path): + return gs_scg_svc_trans_mix + logging.info("cid={},encode,{},sp={}".format(cid, svc_file, time.time() - st)) + return gs_scg_success + + def process_logic(self, cid, work_dir): + """ + work_dir: + ---vocal.wav # 默认人声和伴奏都是44k双声道 + ---acc.wav + ---svc_vocals + model1.wav + model2.wav + ---cache + model1_tmp.wav + model2_tmp.wav + ---output + model1.m4a + model2.m4a + ---emb + model1.npy + model2.npy + :param cid: + :param work_dir: + :return: + """ + p_start = time.time() + # vocal_wav = os.path.join(work_dir, "vocal.wav") + vocal_wav = os.path.join(work_dir, "vocal_01.wav") + vocal_32_wav = os.path.join(work_dir, "vocal_32.wav") + # acc_wav = os.path.join(work_dir, "acc.wav") + acc_wav = os.path.join(work_dir, "acc_01.wav") + if not os.path.exists(vocal_wav) or not os.path.exists(acc_wav): + return gs_scg_no_vocal + + # 将vocal采样率转为32位 + audio, sr = librosa.load(vocal_wav, sr=32000, mono=True) + soundfile.write(vocal_32_wav, audio, sr, format="wav") + + # 开始生成 + svc_vocal_dir = os.path.join(work_dir, "svc_vocals") + if not os.path.exists(svc_vocal_dir): + os.makedirs(svc_vocal_dir) + + print("cid={}, start svc ...".format(cid)) + st = time.time() + out_files = [] + for model_path in self.models: + model_name = model_path.split("/")[-1].replace(".pth", "") + dst_path = os.path.join(svc_vocal_dir, "{}_{}.wav".format(cid, model_name)) + if os.path.exists(dst_path): + continue + + if not os.path.exists(dst_path): + try: + inf(model_path, gs_config_path, vocal_32_wav, dst_path, 'cuda') + except Exception as ex: + print(ex) + if os.path.exists(dst_path): + out_files.append(dst_path) + print("cid={}, svc finish sp={}, len={}".format(cid, time.time() - st, len(out_files))) + + # 提取特征 + print("cid={}, start get emb".format(cid)) + output_dir = os.path.join(work_dir, "output") + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + emb_dir = os.path.join(work_dir, "emb") + if not os.path.exists(emb_dir): + os.makedirs(emb_dir) + + for file in out_files: + spk_emb = infer_main(self.gs_infer, file).detach().cpu().numpy() + fname = file.split("/")[-1].replace(".wav", "") + emb_file = os.path.join(emb_dir, "{}".format(fname)) + np.save(emb_file, spk_emb) + + cur_name = file.split("/")[-1].replace(".wav", ".m4a") + mix_path = os.path.join(output_dir, "{}".format(cur_name)) + err = self.mix(cid, work_dir, file, vocal_wav, acc_wav, mix_path) + if err != gs_scg_success: + print("cid={}, mix err code={}".format(cid, err)) + + print("cid={}, finish, sp={}".format(cid, time.time() - p_start)) + + +if __name__ == '__main__': + scg = SongCoverGenerator() + ww_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/online_data/step1" + # cid = "AIYpdjQVidc." # 女 + # cid = "Vds8ddYXYZY." 
# 男 + for cid in ["AIYpdjQVidc.", "Vds8ddYXYZY."]: + scg.process_logic(cid, os.path.join(ww_dir, cid)) diff --git a/AutoCoverTool/online/search_voice_webui.py b/AutoCoverTool/online/search_voice_webui.py new file mode 100644 index 0000000..7ddb423 --- /dev/null +++ b/AutoCoverTool/online/search_voice_webui.py @@ -0,0 +1,140 @@ +""" +给定一段音频,查找到其最相近的歌曲 +根据性别给候选歌曲,按照本歌曲下的所有模型信息来进行判定即可 +""" + +import os +import time +import glob +import shutil +import soundfile +import numpy as np +import gradio as gr + +from ref.adaptive_voice_conversion.spk_compare1 import infer_load, infer_main, cos_distance + +st = time.time() +gs_infer = infer_load() +print("load model sp={}".format(time.time() - st)) + +gs_tmp_dir = "/tmp/search_voice_web" +gs_emb_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/online_data/step1" +gs_res_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/online_models" +gs_model_dir = os.path.join(gs_res_dir, 'models') + +gs_song_map = { + "female": ["AIYpdjQVidc."], + "male": ["Vds8ddYXYZY."], + "unknown": None +} + +gs_song_list_dropdown = None + + +def song_select(gender): + return gs_song_list_dropdown.update(choices=gs_song_map[gender]), gs_song_map[gender][0] + + +def get_emb_dict(): + model_list = glob.glob(os.path.join(gs_model_dir, "*/*pth")) + m_id_gender = {} + for model in model_list: + m_arr = model.split("/") + m_id = m_arr[-1].replace(".pth", "") + m_gender = m_arr[-2] + m_id_gender[m_id] = m_gender + + print(m_id_gender) + emb_list = glob.glob(os.path.join(gs_emb_dir, "*/*/*npy")) + emb_vec_dict = {} + for emb_list_file in emb_list: + # /data/rsync/jianli.yang/AutoCoverTool/data/online_data/step1/Vds8ddYXYZY./emb/Vds8ddYXYZY._taylor_n3.0_g2000.npy + song_name = emb_list_file.split("/")[-1].split("_")[0] + model_name = "_".join(emb_list_file.split("/")[-1].split("_")[1:]).replace(".npy", "") + gender = m_id_gender[model_name] + emb_vec_dict[emb_list_file] = [np.load(emb_list_file).squeeze(), song_name, gender] # emb,歌曲,性别 + return emb_vec_dict + + +def gen_spk_vec(input_data, gender, song_name): + emb_vec_dict = get_emb_dict() + if os.path.exists(gs_tmp_dir): + shutil.rmtree(gs_tmp_dir) + os.makedirs(gs_tmp_dir) + sr, data = input_data + tmp_file = os.path.join(gs_tmp_dir, "tmp.wav") + soundfile.write(tmp_file, data, sr, format="wav") + + st = time.time() + spk_emb = infer_main(gs_infer, tmp_file) + print("gen_spk_vec input={}, sp={}".format(data.shape[0] / sr, time.time() - st)) + spk_emb = spk_emb.detach().cpu().numpy().squeeze() + # 查询 + mm_rate = 0 + mm_sp_name = None + for sp_name, vec_data in emb_vec_dict.items(): + vec, cur_song_name, cur_gender = vec_data + print("{},{}|{},{}".format(song_name, gender, cur_song_name, cur_gender)) + # 要求包含song_name和gender + if song_name != cur_song_name or gender != cur_gender: + continue + rate = cos_distance(spk_emb, vec) + if mm_sp_name is None or rate > mm_rate: + mm_sp_name = sp_name + mm_rate = rate + print("{},{}".format(sp_name, rate)) + print("MaxRate:{},{}".format(mm_sp_name, mm_rate)) + output_file = mm_sp_name.replace("/emb/", "/output/").replace(".npy", ".m4a") + return output_file + + +def application(data): + print(data) + return + + +def main(): + app = gr.Blocks() + with app: + # 头部介绍 + gr.Markdown(value=""" + ### 一段声音模拟你唱歌(测试版) + 作者:starmaker音视频 + """) + input_audio = gr.inputs.Audio(label="input") + gender = gr.inputs.Radio(choices=["female", "male"], default="female") + global gs_song_list_dropdown + gs_song_list_dropdown = gr.Dropdown(choices=gs_song_map["female"], interactive=True, label="song list") + 
gender.change(song_select, inputs=[gender], outputs=[gs_song_list_dropdown, gs_song_list_dropdown]) + + gen_btn = gr.Button("generate", variant="primary") + + output_audio = gr.outputs.Audio(label="output", type='filepath') + gen_btn.click(fn=gen_spk_vec, inputs=[input_audio, gender, gs_song_list_dropdown], + outputs=output_audio) + # 本方法实现同一时刻只有一个程序在服务器端运行 + app.queue(concurrency_count=1, max_size=2044).launch(server_name="0.0.0.0", inbrowser=True, quiet=True, + server_port=7861) + + +# def test1(): +# import time +# tmp_0_wav = "/data/rsync/jianli.yang/AutoCoverTool/data/test_vec/0.wav" +# tmp_1_wav = "/data/rsync/jianli.yang/AutoCoverTool/data/test_vec/1.wav" +# +# st = time.time() +# spk_emb0 = infer_main(gs_infer, tmp_0_wav).detach().cpu().numpy().squeeze() +# print("gen emb={}".format(time.time() - st)) +# +# for i in range(10): +# st = time.time() +# spk_emb1 = infer_main(gs_infer, tmp_1_wav).detach().cpu().numpy().squeeze() +# print("gen emb={}".format(time.time() - st)) +# +# st = time.time() +# rate = cos_distance(spk_emb0, spk_emb1) +# print("rate={},sp={}".format(rate, time.time() - st)) + + +if __name__ == '__main__': + main() diff --git a/AutoCoverTool/online/webui.py b/AutoCoverTool/online/webui.py index af9baca..30ed200 100644 --- a/AutoCoverTool/online/webui.py +++ b/AutoCoverTool/online/webui.py @@ -1,77 +1,78 @@ """ 构建唱歌音色转换网页(基于3.0) 要求: 1. 音频上传 2. 推理 3. 下载 """ import os import time import glob import shutil import librosa import soundfile import gradio as gr from online.inference_one import inf gs_tmp_dir = "/tmp/auto_cover_tool_web" gs_res_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/online_models" gs_model_dir = os.path.join(gs_res_dir, 'models') gs_config_path = os.path.join(gs_res_dir, 'config.json') gs_models_choices = glob.glob(os.path.join(gs_model_dir, "*/*pth")) gs_model_list_dropdown = None -def svc(audio_data, model_path): +def svc(audio_data, model_path, tran): sr, data = audio_data if os.path.exists(gs_tmp_dir): shutil.rmtree(gs_tmp_dir) os.makedirs(gs_tmp_dir) tmp_path = os.path.join(gs_tmp_dir, "tmp.wav") soundfile.write(tmp_path, data, sr, format="wav") # 重采样到32k audio, sr = librosa.load(tmp_path, sr=32000, mono=True) tmp_path = os.path.join(gs_tmp_dir, "tmp_32.wav") out_path = os.path.join(gs_tmp_dir, "out.wav") - soundfile.write(tmp_path, data, sr, format="wav") + soundfile.write(tmp_path, audio, sr, format="wav") # 推理 - print("svc: {}".format(model_path)) + print("svc: {}, tran={}".format(model_path, tran)) st = time.time() - inf(model_path, gs_config_path, tmp_path, out_path, 'cuda') + inf(model_path, gs_config_path, tmp_path, out_path, 'prod', tran) print("input d={}, sp = {}".format(len(audio) / sr, time.time() - st)) return out_path def model_select(): files = glob.glob(os.path.join(gs_model_dir, "*/*pth")) return gs_model_list_dropdown.update(choices=files) def main(): # header app = gr.Blocks() with app: # 头部介绍 gr.Markdown(value=""" ### 唱歌音色转换 作者:starmaker音视频 """) global gs_model_list_dropdown gs_model_list_dropdown = gr.Dropdown(choices=gs_models_choices, interactive=True, label="model list") refresh_btn = gr.Button("refresh_model_list") refresh_btn.click(fn=model_select, inputs=[], outputs=gs_model_list_dropdown) - # 提示词输入框 + # 音频输入框 input_audio = gr.inputs.Audio(label="input") + vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0) gen_btn = gr.Button("generate", variant="primary") output_audio = gr.outputs.Audio(label="output", type='filepath') - gen_btn.click(fn=svc, inputs=[input_audio, gs_model_list_dropdown], 
outputs=output_audio) + gen_btn.click(fn=svc, inputs=[input_audio, gs_model_list_dropdown, vc_transform], outputs=output_audio) # 本方法实现同一时刻只有一个程序在服务器端运行 app.queue(concurrency_count=1, max_size=2044).launch(server_name="0.0.0.0", inbrowser=True, quiet=True, server_port=7860) if __name__ == '__main__': main() diff --git a/AutoCoverTool/ref/adaptive_voice_conversion b/AutoCoverTool/ref/adaptive_voice_conversion new file mode 160000 index 0000000..d1cad9f --- /dev/null +++ b/AutoCoverTool/ref/adaptive_voice_conversion @@ -0,0 +1 @@ +Subproject commit d1cad9fb4eff74ca56714f9a2527124132fb1ed7 diff --git a/AutoCoverTool/ref/music_remover/separate_interface.py b/AutoCoverTool/ref/music_remover/separate_interface.py index 646d96a..94657bc 100644 --- a/AutoCoverTool/ref/music_remover/separate_interface.py +++ b/AutoCoverTool/ref/music_remover/separate_interface.py @@ -1,103 +1,111 @@ """ 分离的对外函数 """ import os import sys import time import shutil import logging from demucs.pretrained import get_model from demucs.separate import * # 第三方二进制文件 gs_standard_audio_exe = "/opt/soft/bin/standard_audio" gs_ffmpeg_exe = "ffmpeg" # 全局配置 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') # 错误码 ERR_CODE_SUCCESS = 0 ERR_CODE_INPUT_FILE_NOT_EXISTS = 1 class SeparateInterface: """ 分离器对外接口,只生成伴奏 """ def __init__(self): sp_start = time.time() # 评价之后该版本模型的效果效果最佳,性能也合适 # 对比评价过mdx_extra_q和htdemucs_ft # 其中mdx_extra_q和mdx_extra速度一致,但是和声保留情况不如后者 # htdemucs_ft耗时是mdx_extra的1.6倍,在部分歌曲消去的程度上比mdx_extra好,但是和声没有mdx_extra保留的完整|详情见xishuashua的开头和声部分 self.model = get_model('mdx_extra') self.tmp_dir = os.path.join("/tmp/separate") if not os.path.exists(self.tmp_dir): os.makedirs(self.tmp_dir) logging.info("SeparateInterface: load model spent = {}".format(time.time() - sp_start)) def process_logic(self, cid, cache_dir, in_file, vocal_out_file, acc_out_file, dev='cuda'): model = self.model sp_start = time.time() wav = load_track(in_file, model.audio_channels, model.samplerate) logging.info("--------load_track:cid={},sp={}".format(cid, time.time() - sp_start)) # 模型推理 sp_start = time.time() ref = wav.mean(0) wav = (wav - ref.mean()) / ref.std() # wav[None] -> 增加一个维度,原来是[2, xxx] -> [1, 2, xxx] sources = apply_model(model, wav[None], device=dev, shifts=1, split=True, overlap=0.25, progress=True, num_workers=0)[0] sources = sources * ref.std() + ref.mean() logging.info("--------apply_model:cid={},sp={}".format(cid, time.time() - sp_start)) # 只保留伴奏 sources = list(sources) vocals = sources.pop(model.sources.index("vocals")) if vocals is not None: save_audio(vocals, vocal_out_file, samplerate=model.samplerate) other_stem = th.zeros_like(sources[0]) for sc in sources: other_stem += sc if acc_out_file is not None: save_audio(other_stem, acc_out_file, samplerate=model.samplerate) if vocal_out_file is not None: if not os.path.exists(vocal_out_file): return False if acc_out_file is not None: if not os.path.exists(acc_out_file): return False return True def process(self, cid, in_file, vocal_out_file, acc_out_file, dev='cuda'): if not os.path.exists(in_file): return ERR_CODE_INPUT_FILE_NOT_EXISTS st_time = time.time() logging.info("--------process:cid={},{},{},{}".format(cid, in_file, vocal_out_file, acc_out_file)) cache_dir = os.path.join(self.tmp_dir, str(cid)) if os.path.exists(cache_dir): shutil.rmtree(cache_dir) os.makedirs(cache_dir) # 核心处理逻辑 ret = self.process_logic(cid, cache_dir, in_file, vocal_out_file, acc_out_file, dev) shutil.rmtree(cache_dir) logging.info( 
"--------finish:cid={},{},{},{}|{}|sp={}\n\n".format(cid, in_file, vocal_out_file, acc_out_file, dev, ret, time.time() - st_time)) return ret -# if __name__ == '__main__': -# si = SeparateInterface() -# in_f = sys.argv[1] -# out_f = sys.argv[2] -# dev = sys.argv[3] # cuda或者cpu -# for i in range(0, 3): -# si.process(str(1), in_f, out_f, dev) +if __name__ == '__main__': + import glob + + si = SeparateInterface() + in_f = sys.argv[1] + out_f = sys.argv[2] + in_files = glob.glob(os.path.join(in_f, "*m4a")) + for idx, in_file in enumerate(in_files): + b_dir = str(in_file).replace("m4a", "") + if os.path.exists(b_dir): + shutil.rmtree(b_dir) + os.makedirs(b_dir) + v_out = os.path.join(b_dir, "vocal.wav") + a_out = os.path.join(b_dir, "acc.wav") + si.process(str(1), in_file, v_out, a_out, 'cuda') diff --git a/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py b/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py index 2f29b6f..9539a97 100644 --- a/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py +++ b/AutoCoverTool/ref/so_vits_svc/inference/infer_tool.py @@ -1,340 +1,342 @@ import hashlib import json import logging import os import time from pathlib import Path import librosa import maad import numpy as np # import onnxruntime import parselmouth import soundfile import torch import torchaudio from hubert import hubert_model import utils from models import SynthesizerTrn import copy + logging.getLogger('matplotlib').setLevel(logging.WARNING) from mel_processing import spectrogram_torch, spec_to_mel_torch def get_spec(audio): audio_norm = audio print(audio_norm) spec = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False) return spec + def read_temp(file_name): if not os.path.exists(file_name): with open(file_name, "w") as f: f.write(json.dumps({"info": "temp_dict"})) return {} else: try: with open(file_name, "r") as f: data = f.read() data_dict = json.loads(data) if os.path.getsize(file_name) > 50 * 1024 * 1024: f_name = file_name.replace("\\", "/").split("/")[-1] print(f"clean {f_name}") for wav_hash in list(data_dict.keys()): if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600: del data_dict[wav_hash] except Exception as e: print(e) print(f"{file_name} error,auto rebuild file") data_dict = {"info": "temp_dict"} return data_dict def write_temp(file_name, data): with open(file_name, "w") as f: f.write(json.dumps(data)) def timeit(func): def run(*args, **kwargs): t = time.time() res = func(*args, **kwargs) print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t)) return res return run def format_wav(audio_path): if Path(audio_path).suffix == '.wav': return raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None) soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate) def get_end_file(dir_path, end): file_lists = [] for root, dirs, files in os.walk(dir_path): files = [f for f in files if f[0] != '.'] dirs[:] = [d for d in dirs if d[0] != '.'] for f_file in files: if f_file.endswith(end): file_lists.append(os.path.join(root, f_file).replace("\\", "/")) return file_lists def get_md5(content): return hashlib.new("md5", content).hexdigest() def resize2d_f0(x, target_len): source = np.array(x) source[source < 0.001] = np.nan target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)), source) res = np.nan_to_num(target) return res def get_f0(x, p_len, f0_up_key=0): time_step = 160 / 16000 * 1000 f0_min = 50 f0_max = 1100 f0_mel_min = 1127 * np.log(1 + 
f0_min / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0 = parselmouth.Sound(x, 16000).to_pitch_ac( time_step=time_step / 1000, voicing_threshold=0.6, pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] if len(f0) > p_len: f0 = f0[:p_len] pad_size = (p_len - len(f0) + 1) // 2 if (pad_size > 0 or p_len - len(f0) - pad_size > 0): f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant') f0 *= pow(2, f0_up_key / 12) f0_mel = 1127 * np.log(1 + f0 / 700) f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 f0_coarse = np.rint(f0_mel).astype(np.int) return f0_coarse, f0 def clean_pitch(input_pitch): num_nan = np.sum(input_pitch == 1) if num_nan / len(input_pitch) > 0.9: input_pitch[input_pitch != 1] = 1 return input_pitch def plt_pitch(input_pitch): input_pitch = input_pitch.astype(float) input_pitch[input_pitch == 1] = np.nan return input_pitch def f0_to_pitch(ff): f0_pitch = 69 + 12 * np.log2(ff / 440) return f0_pitch def fill_a_to_b(a, b): if len(a) < len(b): for _ in range(0, len(b) - len(a)): a.append(a[0]) def mkdir(paths: list): for path in paths: if not os.path.exists(path): os.mkdir(path) class Svc(object): def __init__(self, net_g_path, config_path, hubert_path="data/models/hubert-soft-0d54a1f4.pt", onnx=False): self.onnx = onnx self.net_g_path = net_g_path self.hubert_path = hubert_path self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.net_g_ms = None self.hps_ms = utils.get_hparams_from_file(config_path) self.target_sample = self.hps_ms.data.sampling_rate self.hop_size = self.hps_ms.data.hop_length self.speakers = {} for spk, sid in self.hps_ms.spk.items(): self.speakers[sid] = spk self.spk2id = self.hps_ms.spk # 加载hubert self.hubert_soft = hubert_model.hubert_soft(hubert_path) if torch.cuda.is_available(): self.hubert_soft = self.hubert_soft.cuda() self.load_model() def load_model(self): # 获取模型配置 if self.onnx: raise NotImplementedError # self.net_g_ms = SynthesizerTrnForONNX( # 178, # self.hps_ms.data.filter_length // 2 + 1, # self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, # n_speakers=self.hps_ms.data.n_speakers, # **self.hps_ms.model) # _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None) else: self.net_g_ms = SynthesizerTrn( self.hps_ms.data.filter_length // 2 + 1, self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, **self.hps_ms.model) _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None) if "half" in self.net_g_path and torch.cuda.is_available(): _ = self.net_g_ms.half().eval().to(self.dev) else: _ = self.net_g_ms.eval().to(self.dev) def get_units(self, source, sr): source = source.unsqueeze(0).to(self.dev) with torch.inference_mode(): start = time.time() units = self.hubert_soft.units(source) use_time = time.time() - start print("hubert use time:{}".format(use_time)) return units def get_unit_pitch(self, in_path, tran): source, sr = torchaudio.load(in_path) source_bak = copy.deepcopy(source) source = torchaudio.functional.resample(source, sr, 16000) if len(source.shape) == 2 and source.shape[1] >= 2: source = torch.mean(source, dim=0).unsqueeze(0) soft = self.get_units(source, sr).squeeze(0).cpu().numpy() f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran) return soft, f0, source_bak def infer(self, speaker_id, tran, raw_path, dev=False): if type(speaker_id) == str: speaker_id = self.spk2id[speaker_id] sid = 
torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0) soft, pitch, source = self.get_unit_pitch(raw_path, tran) f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev) if "half" in self.net_g_path and torch.cuda.is_available(): stn_tst = torch.HalfTensor(soft) else: stn_tst = torch.FloatTensor(soft) # 提取幅度谱 # spec = get_spec(source).to(self.dev) with torch.no_grad(): x_tst = stn_tst.unsqueeze(0).to(self.dev) start = time.time() x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2) audio = self.net_g_ms.infer(x_tst, f0=f0, g=sid)[0, 0].data.float() # audio = self.net_g_ms.infer_v1(x_tst, spec[:, :, :f0.size(-1)], f0=f0, g=sid)[0, 0].data.float() use_time = time.time() - start print("vits use time:{}".format(use_time)) return audio, audio.shape[-1] # class SvcONNXInferModel(object): # def __init__(self, hubert_onnx, vits_onnx, config_path): # self.config_path = config_path # self.vits_onnx = vits_onnx # self.hubert_onnx = hubert_onnx # self.hubert_onnx_session = onnxruntime.InferenceSession(hubert_onnx, providers=['CUDAExecutionProvider', ]) # self.inspect_onnx(self.hubert_onnx_session) # self.vits_onnx_session = onnxruntime.InferenceSession(vits_onnx, providers=['CUDAExecutionProvider', ]) # self.inspect_onnx(self.vits_onnx_session) # self.hps_ms = utils.get_hparams_from_file(self.config_path) # self.target_sample = self.hps_ms.data.sampling_rate # self.feature_input = FeatureInput(self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length) # # @staticmethod # def inspect_onnx(session): # for i in session.get_inputs(): # print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type)) # for i in session.get_outputs(): # print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type)) # # def infer(self, speaker_id, tran, raw_path): # sid = np.array([int(speaker_id)], dtype=np.int64) # soft, pitch = self.get_unit_pitch(raw_path, tran) # pitch = np.expand_dims(pitch, axis=0).astype(np.int64) # stn_tst = soft # x_tst = np.expand_dims(stn_tst, axis=0) # x_tst_lengths = np.array([stn_tst.shape[0]], dtype=np.int64) # # 使用ONNX Runtime进行推理 # start = time.time() # audio = self.vits_onnx_session.run(output_names=["audio"], # input_feed={ # "hidden_unit": x_tst, # "lengths": x_tst_lengths, # "pitch": pitch, # "sid": sid, # })[0][0, 0] # use_time = time.time() - start # print("vits_onnx_session.run time:{}".format(use_time)) # audio = torch.from_numpy(audio) # return audio, audio.shape[-1] # # def get_units(self, source, sr): # source = torchaudio.functional.resample(source, sr, 16000) # if len(source.shape) == 2 and source.shape[1] >= 2: # source = torch.mean(source, dim=0).unsqueeze(0) # source = source.unsqueeze(0) # # 使用ONNX Runtime进行推理 # start = time.time() # units = self.hubert_onnx_session.run(output_names=["embed"], # input_feed={"source": source.numpy()})[0] # use_time = time.time() - start # print("hubert_onnx_session.run time:{}".format(use_time)) # return units # # def transcribe(self, source, sr, length, transform): # feature_pit = self.feature_input.compute_f0(source, sr) # feature_pit = feature_pit * 2 ** (transform / 12) # feature_pit = resize2d_f0(feature_pit, length) # coarse_pit = self.feature_input.coarse_f0(feature_pit) # return coarse_pit # # def get_unit_pitch(self, in_path, tran): # source, sr = torchaudio.load(in_path) # soft = self.get_units(source, sr).squeeze(0) # input_pitch = self.transcribe(source.numpy()[0], sr, soft.shape[0], tran) # return soft, input_pitch class RealTimeVC: def __init__(self): self.last_chunk = None 
self.last_o = None self.chunk_len = 16000 # 区块长度 self.pre_len = 3840 # 交叉淡化长度,640的倍数 """输入输出都是1维numpy 音频波形数组""" def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path): audio, sr = torchaudio.load(input_wav_path) audio = audio.cpu().numpy()[0] temp_wav = io.BytesIO() if self.last_chunk is None: input_wav_path.seek(0) audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path) audio = audio.cpu().numpy() self.last_chunk = audio[-self.pre_len:] self.last_o = audio return audio[-self.chunk_len:] else: audio = np.concatenate([self.last_chunk, audio]) soundfile.write(temp_wav, audio, sr, format="wav") temp_wav.seek(0) audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav) audio = audio.cpu().numpy() ret = maad.util.crossfade(self.last_o, audio, self.pre_len) self.last_chunk = audio[-self.pre_len:] self.last_o = audio return ret[self.chunk_len:2 * self.chunk_len] diff --git a/AutoCoverTool/ref/so_vits_svc/inference_main.py b/AutoCoverTool/ref/so_vits_svc/inference_main.py index 326ad07..d1cc4c8 100644 --- a/AutoCoverTool/ref/so_vits_svc/inference_main.py +++ b/AutoCoverTool/ref/so_vits_svc/inference_main.py @@ -1,85 +1,84 @@ import io import os import sys import logging import time from pathlib import Path from copy import deepcopy import torch import librosa import numpy as np import soundfile from inference import infer_tool from inference import slicer from inference.infer_tool import Svc logging.getLogger('numba').setLevel(logging.WARNING) chunks_dict = infer_tool.read_temp("ref/so_vits_svc/inference/chunks_temp.json") -def inf(model_path, config_path, raw_audio_path, dst_path, dev): +def inf(model_path, config_path, raw_audio_path, dst_path, dev, tran=0): # model_path = "logs/32k/G_174000-Copy1.pth" # config_path = "configs/config.json" svc_model = Svc(model_path, config_path) out_dir = os.path.dirname(dst_path) print(dst_path) os.makedirs(out_dir, exist_ok=True) # 支持多个wav文件,放在raw文件夹下 - tran = 0 spk_list = ['speaker0'] # 每次同时合成多语者音色 slice_db = -40 # 默认-40,嘈杂的音频可以-30,干声保留呼吸可以-50 wav_format = 'wav' # 音频输出格式 # infer_tool.fill_a_to_b(trans, clean_names) # for clean_name, tran in zip(clean_names, trans): # raw_audio_path = f"raw/{clean_name}" # if "." 
not in raw_audio_path: # raw_audio_path += ".wav" infer_tool.format_wav(raw_audio_path) wav_path = Path(raw_audio_path).with_suffix('.wav') chunks = slicer.cut(wav_path, db_thresh=slice_db) audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks) for spk in spk_list: audio = [] for (slice_tag, data) in audio_data: print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======') length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample)) raw_path = io.BytesIO() soundfile.write(raw_path, data, audio_sr, format="wav") raw_path.seek(0) if slice_tag: print('jump empty segment') _audio = np.zeros(length) else: out_audio, out_sr = svc_model.infer(spk, tran, raw_path, dev == "test") _audio = out_audio.cpu().numpy() audio.extend(list(_audio)) soundfile.write(dst_path, audio, svc_model.target_sample, format=wav_format) if __name__ == '__main__': g_model = sys.argv[1] # 模型地址 g_config = sys.argv[2] # 配置文件地址 g_audio_path = sys.argv[3] # 输入的音频文件地址,wav g_dst_path = sys.argv[4] # 输出的音频文件地址 if os.path.exists(g_dst_path): print("{} success ...".format(g_dst_path)) exit(0) g_dev = "prod" if len(sys.argv) > 5: g_dev = sys.argv[5] g_aa, g_sr = librosa.load(g_audio_path) d = librosa.get_duration(g_aa, g_sr) # if g_dev != "test": # if d > 250: # print("{} too long".format(g_audio_path)) # exit(0) st = time.time() inf(g_model, g_config, g_audio_path, g_dst_path, g_dev) print("{}, inference sp={}".format(g_audio_path, time.time() - st)) diff --git a/AutoCoverTool/ref/so_vits_svc/train.py b/AutoCoverTool/ref/so_vits_svc/train.py index 75e99cd..69f56ac 100644 --- a/AutoCoverTool/ref/so_vits_svc/train.py +++ b/AutoCoverTool/ref/so_vits_svc/train.py @@ -1,306 +1,312 @@ import logging logging.getLogger('matplotlib').setLevel(logging.WARNING) logging.getLogger('numba').setLevel(logging.WARNING) import os import json import argparse import itertools import math import torch from torch import nn, optim from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter import torch.multiprocessing as mp import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler import commons import utils from data_utils import TextAudioSpeakerLoader, EvalDataLoader from models import ( SynthesizerTrn, MultiPeriodDiscriminator, ) from losses import ( kl_loss, generator_loss, discriminator_loss, feature_loss ) from mel_processing import mel_spectrogram_torch, spec_to_mel_torch torch.backends.cudnn.benchmark = True global_step = 0 # os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO' def main(): """Assume Single Node Multi GPUs Training Only""" assert torch.cuda.is_available(), "CPU training is not allowed." 
hps = utils.get_hparams() n_gpus = torch.cuda.device_count() os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = hps.train.port mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) def run(rank, n_gpus, hps): print("CurRank:===>", rank) global global_step if rank == 0: logger = utils.get_logger(hps.model_dir) logger.info(hps) utils.check_git_hash(hps.model_dir) writer = SummaryWriter(log_dir=hps.model_dir) writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) torch.manual_seed(hps.train.seed) torch.cuda.set_device(rank) # 从每段音频文件中获取特征 # hubert特征,f0,幅度谱特征,对应音频段波形(384 * hop_length),人声编码[0],每一次获取3840ms长度的特征 train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps) train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, batch_size=hps.train.batch_size) if rank == 0: eval_dataset = EvalDataLoader(hps.data.validation_files, hps) eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False, batch_size=1, pin_memory=False, drop_last=False) net_g = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, **hps.model).cuda(rank) net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) optim_g = torch.optim.AdamW( net_g.parameters(), hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) optim_d = torch.optim.AdamW( net_d.parameters(), hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps) net_g = DDP(net_g, device_ids=[rank]) # , find_unused_parameters=True) net_d = DDP(net_d, device_ids=[rank]) try: _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) global_step = (epoch_str - 1) * len(train_loader) print("load checkpoint ok !") except: epoch_str = 1 global_step = 0 scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scaler = GradScaler(enabled=hps.train.fp16_run) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval]) else: train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None) scheduler_g.step() scheduler_d.step() def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): net_g, net_d = nets optim_g, optim_d = optims scheduler_g, scheduler_d = schedulers train_loader, eval_loader = loaders if writers is not None: writer, writer_eval = writers # train_loader.batch_sampler.set_epoch(epoch) global global_step net_g.train() net_d.train() for batch_idx, items in enumerate(train_loader): # hubert特征,f0,幅度谱特征,对应音频段波形(384 * hop_length),人声编码[0] c, f0, spec, y, spk = items g = spk.cuda(rank, non_blocking=True) spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True) c = c.cuda(rank, non_blocking=True) f0 = f0.cuda(rank, non_blocking=True) """ "sampling_rate": 32000, "filter_length": 1280, "hop_length": 320, "win_length": 1280, "n_mel_channels": 80, "mel_fmin": 0.0, "mel_fmax": null """ 
mel = spec_to_mel_torch( spec, hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax) with autocast(enabled=hps.train.fp16_run): # net_g的输入: hubert特征,f0,幅度谱特征,说话人id,mel谱特征 # net_g的输出: # 原始波形,批次中每个采样到的帧的位置,批次中幅度谱的有效帧位置, # 幅度谱编码得到正态分布后随机采样得到的z, z经过标准化流之后得到z_p, hubert特征层得到的正态分布的均值, # hubert特征层得到的正态分布的标准差(logs_p),幅度谱和人声信息得到的均值(m_q),幅度谱和人声信息得到的标准差(logs_q) y_hat, ids_slice, z_mask, \ (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel) y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) y_hat_mel = mel_spectrogram_torch( y_hat.squeeze(1), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, hps.data.mel_fmax ) y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) with autocast(enabled=False): loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) loss_disc_all = loss_disc optim_d.zero_grad() scaler.scale(loss_disc_all).backward() scaler.unscale_(optim_d) grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) scaler.step(optim_d) with autocast(enabled=hps.train.fp16_run): # Generator y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) with autocast(enabled=False): # mel谱之间的损失函数,后面是系数,误差越小越好 loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel # KL散度,z_p: 幅度谱侧得到的采样值经过标准化流之后的结果,logs_q: 幅度谱侧得到的标准差,m_p:hubert侧得到的均值 # logs_p: hubert侧得到的标准差,z_mask: 批次中幅度谱的有效帧位置, loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl # 在d模型中将y和y_hat的每一层特征结果都拿出来,做l1距离 loss_fm = feature_loss(fmap_r, fmap_g) loss_gen, losses_gen = generator_loss(y_d_hat_g) loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) scaler.step(optim_g) scaler.update() if rank == 0: if global_step % hps.train.log_interval == 0: lr = optim_g.param_groups[0]['lr'] losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] logger.info('Train Epoch: {} [{:.0f}%]'.format( epoch, 100. 
* batch_idx / len(train_loader))) logger.info([x.item() for x in losses] + [global_step, lr]) scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}) scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) image_dict = { "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), } utils.summarize( writer=writer, global_step=global_step, images=image_dict, scalars=scalar_dict ) if global_step % hps.train.eval_interval == 0: evaluate(hps, net_g, eval_loader, writer_eval) utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) + + # 达到2000个step则停止 + if global_step == 2000: + logger.info('====> 2000 ==> Epoch: {},{}'.format(epoch, global_step)) + exit(0) + global_step += 1 if rank == 0: logger.info('====> Epoch: {},{}'.format(epoch, global_step)) def evaluate(hps, generator, eval_loader, writer_eval): generator.eval() image_dict = {} audio_dict = {} with torch.no_grad(): for batch_idx, items in enumerate(eval_loader): c, f0, spec, y, spk = items g = spk[:1].cuda(0) spec, y = spec[:1].cuda(0), y[:1].cuda(0) c = c[:1].cuda(0) f0 = f0[:1].cuda(0) mel = spec_to_mel_torch( spec, hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax) y_hat = generator.module.infer(c, f0, g=g, mel=mel) y_hat_mel = mel_spectrogram_torch( y_hat.squeeze(1).float(), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, hps.data.mel_fmax ) audio_dict.update({ f"gen/audio_{batch_idx}": y_hat[0], f"gt/audio_{batch_idx}": y[0] }) image_dict.update({ f"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()), "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy()) }) utils.summarize( writer=writer_eval, global_step=global_step, images=image_dict, audios=audio_dict, audio_sampling_rate=hps.data.sampling_rate ) generator.train() if __name__ == "__main__": main() diff --git a/AutoCoverTool/resource/0414_0514.csv b/AutoCoverTool/res/0414_0514.csv similarity index 100% rename from AutoCoverTool/resource/0414_0514.csv rename to AutoCoverTool/res/0414_0514.csv diff --git a/AutoCoverTool/script/get_user_recordings.py b/AutoCoverTool/script/get_user_recordings.py index 0eb9856..3d56aed 100644 --- a/AutoCoverTool/script/get_user_recordings.py +++ b/AutoCoverTool/script/get_user_recordings.py @@ -1,128 +1,178 @@ """ 获取用户数据 """ import os import time import glob import json import librosa import soundfile from script.common import * def exec_cmd(cmd): r = os.popen(cmd) text = r.read() r.close() return text def get_d(audio_path): cmd = "ffprobe -v quiet -print_format json -show_format -show_streams {}".format(audio_path) data = exec_cmd(cmd) data = json.loads(data) if "format" in data.keys(): if "duration" in data['format']: return 
float(data["format"]["duration"]) return 0 def get_user_recordings(user_id): sql = "select id, recording_url from recording where user_id={} and created_on > {} and is_public = 1 and is_deleted = 0 and media_type in (1, 2, 3, 4, 9, 10) ".format( user_id, time.time() - 86400 * 30) res = get_shard_data_by_sql(sql, user_id) true_num = 0 for id, url in res: if download_url(url, user_id, str(id)): true_num += 1 if true_num > 15: break def download_url(url, uid, rid): url = str(url).replace("master.mp4", "origin_master.mp4") - c_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_byw_man/{}".format(uid) + c_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0414_0514/{}".format(uid) if not os.path.exists(c_dir): os.makedirs(c_dir) c_dir = os.path.join(c_dir, "src") if not os.path.exists(c_dir): os.makedirs(c_dir) cmd = "wget {} -O {}/{}.mp4".format(url, c_dir, rid) os.system(cmd) # 转码为44k双声道音频 in_path = os.path.join(c_dir, rid + ".mp4") if os.path.exists(in_path): duration = get_d(in_path) print("duration={}".format(duration)) if duration > 30: dst_path = in_path.replace(".mp4", ".wav") cmd = "ffmpeg -i {} -ar 44100 -ac 1 {}".format(in_path, dst_path) print("exec={}".format(cmd)) os.system(cmd) return os.path.exists(dst_path) return False def split_to_idx(ppath, dst_path, user_id): frame_len = 32000 * 15 files = glob.glob(os.path.join(ppath, "*mp4")) mmax = 0 for file in files: try: audio, sr = librosa.load(file, sr=32000, mono=True) except Exception as ex: continue print("audio_len:={}".format(audio.shape)) for i in range(0, len(audio), frame_len): if i + frame_len > len(audio): break cur_data = audio[i:i + frame_len] out_path = os.path.join(dst_path, "{}_{}.wav".format(user_id, mmax)) print("save to {}".format(out_path)) # librosa.output.write_wav(out_path, cur_data, 32000) soundfile.write(out_path, cur_data, 32000, format="wav") mmax += 1 +def process(): + from online.beanstalk_helper import BeanstalkHelper + config = {"addr": "sg-test-common-box-1:11300", "consumer": "auto_cover_tool_download_user"} + bean_helper = BeanstalkHelper(config) + bean = bean_helper.get_beanstalkd() + bean.watch(config["consumer"]) + while True: + payload = bean.reserve(5) + if not payload: + logging.info("bean sleep...") + continue + in_data = json.loads(payload.body) + get_user_recordings(in_data["user_id"]) + payload.delete() + + +def put_data(file_path): + lines = [] + with open(file_path, "r") as f: + while True: + line = f.readline().strip() + if not line: + break + lines.append(line) + from online.beanstalk_helper import BeanstalkHelper + config = {"addr": "sg-test-common-box-1:11300", "consumer": "auto_cover_tool_download_user"} + bean_helper = BeanstalkHelper(config) + for idx, line in enumerate(lines): + if idx == 0: + continue + user_id = line.split(",")[0] + message = json.dumps({"user_id": str(user_id)}) + bean_helper.put_payload_to_beanstalk(config["consumer"], message) + + +def copy_data(): + base_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0414_0514" + dst_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0414_0514_finish" + # 只要10首干声以及以上的 + dirs = glob.glob(os.path.join(base_dir, "*")) + for cur_dir in dirs: + cur_name = cur_dir.split("/")[-1] + cur_mp4_files = glob.glob(os.path.join(cur_dir, "src/*wav")) + if len(cur_mp4_files) > 10: + print("mv {} {}".format(cur_dir, os.path.join(dst_dir, cur_name))) + + if __name__ == '__main__': - arr = [ - "5348024335101054", - "4222124657245641", - "5629499489117674", - "12384898975368914", - 
"5629499489839033", - "5348024336648185", - "5910973794961321", - "3635518643", - "844424937670811", - "4785074600577375", - "6755399442719465", - "4785074603156924", - "11540474053041727", - "6473924129711210", - "7036874421386111", - "7599824376482810", - "6755399447475416", - "8444249306118343", - "3377699721107378", - "12947848931397021", - "7599824374449011", - "3096224748076687", - "12103424006572822", - "1125899914308640", - "12666373952417962", - "281474982845813", - "11821949029679778", - "12947848937379499", - "12947848936090348", - "3096224747262571", - "2814749767432467", - "5066549357604730", - "3096224751151928" - ] - for uuid in arr: - get_user_recordings(uuid) - print("finish =={} ".format(uuid)) + process() + # put_data("res/0414_0514.csv") + # arr = [ + # "5348024335101054", + # "4222124657245641", + # "5629499489117674", + # "12384898975368914", + # "5629499489839033", + # "5348024336648185", + # "5910973794961321", + # "3635518643", + # "844424937670811", + # "4785074600577375", + # "6755399442719465", + # "4785074603156924", + # "11540474053041727", + # "6473924129711210", + # "7036874421386111", + # "7599824376482810", + # "6755399447475416", + # "8444249306118343", + # "3377699721107378", + # "12947848931397021", + # "7599824374449011", + # "3096224748076687", + # "12103424006572822", + # "1125899914308640", + # "12666373952417962", + # "281474982845813", + # "11821949029679778", + # "12947848937379499", + # "12947848936090348", + # "3096224747262571", + # "2814749767432467", + # "5066549357604730", + # "3096224751151928" + # ] + # for uuid in arr: + # get_user_recordings(uuid) + # print("finish =={} ".format(uuid)) + # copy_data() diff --git a/AutoCoverTool/script/get_vocals_for_train.py b/AutoCoverTool/script/get_vocals_for_train.py index 02df6c6..e19c519 100644 --- a/AutoCoverTool/script/get_vocals_for_train.py +++ b/AutoCoverTool/script/get_vocals_for_train.py @@ -1,87 +1,99 @@ from ref.music_remover.separate_interface import * import sys import glob import librosa import soundfile +gs_denoise_exe = "/data/prod/bin/denoise_exe" + def get_vocal(in_file, vocal_out_file, acc_out_file): inst = SeparateInterface() try: inst.process(str(1), in_file, vocal_out_file, acc_out_file, 'cuda') except Exception as ex: print(ex) def split_to_idx(ppath, dst_path, user_id): frame_len = 32000 * 15 files = glob.glob(os.path.join(ppath, "*wav")) mmax = 0 for file in files: try: audio, sr = librosa.load(file, sr=32000, mono=True) except Exception as ex: continue print("audio_len:={}".format(audio.shape)) for i in range(0, len(audio), frame_len): if i + frame_len > len(audio): break cur_data = audio[i:i + frame_len] out_path = os.path.join(dst_path, "{}_{}.wav".format(user_id, mmax)) print("save to {}".format(out_path)) soundfile.write(out_path, cur_data, 32000, format="wav") mmax += 1 def abandon_wav(ppath): files = glob.glob(os.path.join(ppath, "*wav")) power_list = [] for file in files: try: audio, sr = librosa.load(file, sr=32000, mono=True) except Exception as ex: continue power = sum(audio * audio) / len(audio) power_list.append([power, file]) power_list_sorted = sorted(power_list, key=lambda x: x[0], reverse=True) for idx, file in enumerate(power_list_sorted): print(idx) if idx >= 80: os.unlink(file[1]) print("{},{}".format(idx, file[1])) def get_all_vocals(in_dir, out_dir): # dir = "data/train_users/zjl/src" # out_dir = "data/train_users/zjl/vocals/" - files = glob.glob(os.path.join(in_dir, "*wav")) + files = glob.glob(os.path.join(in_dir, "*mp4")) for file in files: idx = 
file.split(".")[-2].split("/")[-1]
-        get_vocal(file, os.path.join(out_dir, "{}.wav".format(idx)), None)
+        out_tmp_path = os.path.join(out_dir, "{}_tmp.wav".format(idx))
+        out_path = os.path.join(out_dir, "{}.wav".format(idx))
+        cmd = "ffmpeg -i {} -ar 32000 -ac 1 {}".format(file, out_tmp_path)
+        # The inputs are already dry vocals, so no separation is needed here
+        os.system(cmd)
+        if os.path.exists(out_tmp_path):
+            # Run one pass of denoising
+            cmd = "{} {} {}".format(gs_denoise_exe, out_tmp_path, out_path)
+            os.system(cmd)
+            os.unlink(out_tmp_path)
+        # get_vocal(file, os.path.join(out_dir, "{}.wav".format(idx)), None)
if __name__ == '__main__':
    name = sys.argv[1]  # folder name
-    base = "data/train_users/0317_top200"
+    base = "data/train_users/multi_users"
    print("start get_all_vocals ...")
    vocals_dir = os.path.join(base, os.path.join(name, "vocals"))
    if not os.path.exists(vocals_dir):
        os.makedirs(vocals_dir)
    speaker0_dir = os.path.join(base, os.path.join(name, "speaker0"))
    if not os.path.exists(speaker0_dir):
        os.makedirs(speaker0_dir)
    get_all_vocals(os.path.join(base, os.path.join(name, "src")), vocals_dir)
    print("start split_to_idx ...")
    split_to_idx(os.path.join(base, os.path.join(name, "vocals")), speaker0_dir, "0")
    print("start abandon_wav ...")
    abandon_wav(speaker0_dir)
    # src_file = sys.argv[1]
    # dst_vocal_file = sys.argv[2]
    # dst_acc_file = sys.argv[3]
    # get_vocal(src_file, dst_vocal_file, dst_acc_file)
diff --git a/AutoCoverTool/script/train.sh b/AutoCoverTool/script/train.sh
index 343f770..5f7a31b 100644
--- a/AutoCoverTool/script/train.sh
+++ b/AutoCoverTool/script/train.sh
@@ -1,21 +1,22 @@
-export LD_LIBRARY_PATH=/data/gpu_env_common/env/anaconda3/envs/so_vits_svc/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/data/gpu_env_common/env/anaconda3/envs/so_vits_svc/lib:$LD_LIBRARY_PATH
export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin
-export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs
+#export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs
+export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame:$PWD/ref/adaptive_voice_conversion
-mkdir -p /data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/filelists
-mkdir -p /data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/config
+mkdir -p /data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists
+mkdir -p /data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/config
# 1. Collect the data and put it under train_users/zjl/src
# 2. Extract vocals & slice & keep the Top80 loudest segments
-/data/gpu_env_common/env/anaconda3/envs/demucs/bin/python script/get_vocals_for_train.py $1
+/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python script/get_vocals_for_train.py $1
# 3. Resample
-/data/gpu_env_common/env/anaconda3/envs/so_vits_svc/bin/python ref/so_vits_svc/resample.py --in_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1 --out_dir2=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/slice_resample
+/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/resample.py --in_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1 --out_dir2=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample
# 4. Generate config files
-/data/gpu_env_common/env/anaconda3/envs/so_vits_svc/bin/python ref/so_vits_svc/preprocess_flist_config.py --source_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/slice_resample --train_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/filelists/train.txt --val_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/filelists/val.txt --test_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/filelists/test.txt --config_path=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/config/config.json
+/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/preprocess_flist_config.py --source_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample --train_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/train.txt --val_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/val.txt --test_list=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/filelists/test.txt --config_path=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/config/config.json
# 5. Preprocess and extract features
-/data/gpu_env_common/env/anaconda3/envs/so_vits_svc/bin/python ref/so_vits_svc/preprocess_hubert_f0.py --in_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0317_top200/$1/slice_resample
+/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/preprocess_hubert_f0.py --in_dir=/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/$1/slice_resample
# 6. Copy data into the logs folder
-mkdir -p data/train_users/0317_top200/$1/logs/32k
-cp -r data/models/G_0.pth data/train_users/0317_top200/$1/logs/32k
-cp -r data/models/D_0.pth data/train_users/0317_top200/$1/logs/32k
+mkdir -p data/train_users/multi_users/$1/logs/32k
+cp -r data/models/G_0.pth data/train_users/multi_users/$1/logs/32k
+cp -r data/models/D_0.pth data/train_users/multi_users/$1/logs/32k
# 7.
训练 -/data/gpu_env_common/env/anaconda3/envs/so_vits_svc/bin/python ref/so_vits_svc/train.py -c data/train_users/0317_top200/$1/config/config.json -m 32k -l data/train_users/0317_top200/$1/logs +/data/gpu_env_common/env/anaconda3/envs/auto_song_cover/bin/python ref/so_vits_svc/train.py -c data/train_users/multi_users/$1/config/config.json -m 32k -l data/train_users/multi_users/$1/logs diff --git a/AutoCoverTool/script/train_user.sh b/AutoCoverTool/script/train_user.sh index 6822f58..e69de29 100644 --- a/AutoCoverTool/script/train_user.sh +++ b/AutoCoverTool/script/train_user.sh @@ -1,139 +0,0 @@ -bash script/train.sh 10133099162774898 -bash script/train.sh 10133099165505798 -bash script/train.sh 10133099166238022 -bash script/train.sh 10133099166303694 -bash script/train.sh 10414574140750700 -bash script/train.sh 10414574143604234 -bash script/train.sh 10414574147828554 -bash script/train.sh 10696049121628986 -bash script/train.sh 10977524092826650 -bash script/train.sh 10977524093357608 -bash script/train.sh 10977524096777752 -bash script/train.sh 1125899910556940 -bash script/train.sh 1125899912511535 -bash script/train.sh 1125899912859360 -bash script/train.sh 1125899912929958 -bash script/train.sh 1125899913358232 -bash script/train.sh 1125899914953727 -bash script/train.sh 11821949025902914 -bash script/train.sh 11821949026179480 -bash script/train.sh 11821949026431616 -bash script/train.sh 11821949029742724 -bash script/train.sh 12103423998600142 -bash script/train.sh 12103423999409280 -bash script/train.sh 12103424002079730 -bash script/train.sh 12103424005848546 -bash script/train.sh 12947848932595064 -bash script/train.sh 12947848933097644 -bash script/train.sh 12947848933316944 -bash script/train.sh 12947848933899946 -bash script/train.sh 12947848936938186 -bash script/train.sh 13229323906199158 -bash script/train.sh 13792273859683018 -bash script/train.sh 13792273860722074 -bash script/train.sh 1688849861813535 -bash script/train.sh 1688849863542144 -bash script/train.sh 1688849864113505 -bash script/train.sh 1688849867241550 -bash script/train.sh 1688849867808490 -bash script/train.sh 1688849867868127 -bash script/train.sh 1688849868202697 -bash script/train.sh 1970324838195179 -bash script/train.sh 1970324840341371 -bash script/train.sh 1970324842945245 -bash script/train.sh 2251799815699655 -bash script/train.sh 2251799816635889 -bash script/train.sh 2251799817218088 -bash script/train.sh 2251799819353845 -bash script/train.sh 2533274793280297 -bash script/train.sh 2533274793464264 -bash script/train.sh 2533274793682339 -bash script/train.sh 2533274794030877 -bash script/train.sh 2533274795871759 -bash script/train.sh 2533274796020110 -bash script/train.sh 281474976786003 -bash script/train.sh 2814749768921322 -bash script/train.sh 2814749769914305 -bash script/train.sh 2814749769999875 -bash script/train.sh 2814749774295517 -bash script/train.sh 2814749774761906 -bash script/train.sh 281474979999994 -bash script/train.sh 281474983182829 -bash script/train.sh 3096224745564693 -bash script/train.sh 3096224747284097 -bash script/train.sh 3096224747482330 -bash script/train.sh 3096224747502899 -bash script/train.sh 3377699723287863 -bash script/train.sh 3377699724121868 -bash script/train.sh 3377699728561333 -bash script/train.sh 3638106974 -bash script/train.sh 3659174700455188 -bash script/train.sh 3659174704229686 -bash script/train.sh 3659174704442140 -bash script/train.sh 3659174705167593 -bash script/train.sh 3940649674894493 -bash script/train.sh 3940649677150746 -bash 
script/train.sh 3940649677324319 -bash script/train.sh 3940649677979647 -bash script/train.sh 3940649681330250 -bash script/train.sh 4222124654431329 -bash script/train.sh 4222124658020004 -bash script/train.sh 4222124658467837 -bash script/train.sh 4785074599520860 -bash script/train.sh 4785074599792504 -bash script/train.sh 4785074600347962 -bash script/train.sh 5066549357106838 -bash script/train.sh 5066549357219186 -bash script/train.sh 5066549357798842 -bash script/train.sh 5348024333764403 -bash script/train.sh 5348024333962443 -bash script/train.sh 5348024334065079 -bash script/train.sh 5348024339154304 -bash script/train.sh 5629499489621786 -bash script/train.sh 562949957074725 -bash script/train.sh 562949957107160 -bash script/train.sh 562949959231308 -bash script/train.sh 562949959983729 -bash script/train.sh 562949961216348 -bash script/train.sh 5910973797429506 -bash script/train.sh 6192448700267754 -bash script/train.sh 6192448704854979 -bash script/train.sh 6192448705925464 -bash script/train.sh 6192448706574397 -bash script/train.sh 6473924132718447 -bash script/train.sh 6755399375717418 -bash script/train.sh 6755399378511542 -bash script/train.sh 6755399379632118 -bash script/train.sh 6755399443759397 -bash script/train.sh 6755399447820306 -bash script/train.sh 7036874420974681 -bash script/train.sh 7036874422862432 -bash script/train.sh 7036874423888346 -bash script/train.sh 7036874424774508 -bash script/train.sh 7318349395720624 -bash script/train.sh 7318349397789093 -bash script/train.sh 7318349399797992 -bash script/train.sh 7318349400063521 -bash script/train.sh 7318349402559835 -bash script/train.sh 7599824374258214 -bash script/train.sh 7599824374439276 -bash script/train.sh 7599824374580278 -bash script/train.sh 7599824374587612 -bash script/train.sh 7881299351264940 -bash script/train.sh 7881299355927226 -bash script/train.sh 8162774325621941 -bash script/train.sh 8162774328094468 -bash script/train.sh 8444249304724233 -bash script/train.sh 8444249305973270 -bash script/train.sh 8444249306804457 -bash script/train.sh 844424931580446 -bash script/train.sh 844424933082968 -bash script/train.sh 844424933461071 -bash script/train.sh 844424933613855 -bash script/train.sh 844424933858702 -bash script/train.sh 844424934490972 -bash script/train.sh 844424937122795 -bash script/train.sh 844424937146141 -bash script/train.sh 8725724280938085 -bash script/train.sh 8725724284783179 -bash script/train.sh 8725724285034474 \ No newline at end of file
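
A minimal usage sketch for the BeanstalkHelper producer/consumer pair introduced above, mirroring put_data() and process() in script/get_user_recordings.py. The beanstalkd address, tube name, and payload shape are taken from this patch; the user_id value and the print call are illustrative placeholders only.

import json
from online.beanstalk_helper import BeanstalkHelper

config = {"addr": "sg-test-common-box-1:11300", "consumer": "auto_cover_tool_download_user"}
helper = BeanstalkHelper(config)

# Producer: enqueue one download job per user (illustrative user_id).
helper.put_payload_to_beanstalk(config["consumer"], json.dumps({"user_id": "123"}))

# Consumer: watch the same tube, reserve a job, handle it, then delete it.
bean = helper.get_beanstalkd()
bean.watch(config["consumer"])
job = bean.reserve(5)
if job:
    user_id = json.loads(job.body)["user_id"]
    print("would call get_user_recordings({})".format(user_id))
    job.delete()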