/**
 * Convert 16-bit PCM samples to floats by dividing by 32768.
 *
 * @param pInBuf  input samples, at least nLen entries
 * @param nLen    number of samples to convert; nothing is written when <= 0
 * @param pOutBuf output buffer, at least nLen entries
 * @return always 0
 */
int short2float(short *pInBuf, int nLen, float *pOutBuf)
{
    for (int i = 0; i < nLen; i++)
    {
        // A 16-bit value divided by 2^15 is exactly representable as a float,
        // so the narrowing from double loses nothing.
        pOutBuf[i] = static_cast<float>(pInBuf[i] / 32768.0);
    }
    return 0;
}

/**
 * Convert float samples to 16-bit PCM, saturating instead of wrapping.
 *
 * Each sample is scaled by 32768 and truncated toward zero. Results outside
 * the 16-bit range are clamped: without the clamp a full-scale +1.0 becomes
 * 32768, which wraps to -32768 when stored in a short, and larger values hit
 * an out-of-range float-to-int conversion. NaN samples are written as 0.
 *
 * @param pInBuf  input samples, at least nLen entries
 * @param nLen    number of samples to convert; nothing is written when <= 0
 * @param pOutBuf output buffer, at least nLen entries
 * @return always 0
 */
int float2short(float *pInBuf, int nLen, short *pOutBuf)
{
    const float kMaxSample = 32767.0f;
    const float kMinSample = -32768.0f;

    for (int i = 0; i < nLen; i++)
    {
        const float fScaled = pInBuf[i] * 32768;

        short nSample = 0; // NaN fails every comparison below and stays 0
        if (fScaled >= kMaxSample)
        {
            nSample = 32767;
        }
        else if (fScaled <= kMinSample)
        {
            nSample = -32768;
        }
        else if (fScaled == fScaled)
        {
            // In range: truncate toward zero, as the int() cast always did.
            nSample = static_cast<short>(static_cast<int>(fScaled));
        }
        pOutBuf[i] = nSample;
    }
    return 0;
}
%d\n", nLength); ebur128_state *st = NULL; st = ebur128_init(nChannel, nSampleRate, EBUR128_MODE_I); if (NULL == st) { return -1; } int nPos = 0; int nTmpLength = 0; int nRet; printf("process start ..\n"); while (nPos < nLength) { nTmpLength = PROC_LEN; if (nLength - nPos < PROC_LEN) { nTmpLength = nLength - nPos; } - nRet = ebur128_add_frames_short(st, pData + nPos, nTmpLength / nChannel); + nRet = ebur128_add_frames_float(st, pData + nPos, nTmpLength / nChannel); if (nRet != 0) { return -2; } nPos += nTmpLength; } printf("process ok..\n"); gated_loudness = -1; ebur128_loudness_global(st, &gated_loudness); float db = (DEFAULT_BASELINE_DB - gated_loudness) / 20.f; gain = pow(10, db); printf("gated_loudness = %f db = %f gain = %f\n", gated_loudness, db, gain); ebur128_destroy(&st); return 0; } /** * 混合音频和伴奏 * @param pVocalIn * @param pAccIn * @param nLength * @param gainVocal * @param gainAcc * @param pOutBuf * @return */ int mix(float *pVocalIn, float *pAccIn, int nLength, double gainVocal, double gainAcc, float *pOutBuf, int nSampleRate, int nChannel, int nDelay) { CAudioMixer *cAudioMixer = new CAudioMixer(); cAudioMixer->init(nSampleRate, nChannel); cAudioMixer->set_acc_delay(nDelay); cAudioMixer->set_vocal_volume(int(gainVocal * 50)); cAudioMixer->set_acc_volume(int(gainAcc * 50)); int nPos = 0; int nStep = 1024; float *fTmp = new float[nStep]; cAudioMixer->reset(); nPos = 0; nStep = 1024; int cnt = 0; while (nPos < nLength) { if (nLength - nPos < nStep) { nStep = nLength - nPos; } cnt++; cAudioMixer->process(pVocalIn + nPos, pAccIn + nPos, pOutBuf + nPos, nStep); nPos += nStep; } delete cAudioMixer; delete[] fTmp; return 0; } int denoise_webrtc(short *pInBuf, int nLength, int nChannel, int nSampleRate) { CWebrtcDenoise cWebrtcDenoise; cWebrtcDenoise.init(nSampleRate, nChannel); float *pTmp = new float[nLength]; for (int i = 0; i < nLength; i++) { pTmp[i] = pInBuf[i] * 1.0 / 32768; } cWebrtcDenoise.set_level(kHigh); int nStep = 512 * nChannel; for (int i = 0; i 
/**
 * Ratio of the reference signal's energy to the input signal's energy.
 *
 * Both energies are sums of squared samples over the first
 * min(in_len, ref_len) entries. Samples are widened to double before they
 * are squared, so no product is rounded to float precision on its way into
 * the accumulators.
 *
 * When the input energy is zero (silence, or a length <= 0) the division
 * follows IEEE rules and yields inf or NaN; callers that use the result as a
 * gain should check it first.
 *
 * @param in_data  input samples, at least in_len entries
 * @param in_len   number of input samples
 * @param ref_data reference samples, at least ref_len entries
 * @param ref_len  number of reference samples
 * @return ref_power / in_power
 */
double calc_power_rate(float *in_data, int32_t in_len, float *ref_data, int32_t ref_len)
{
    const int32_t min_len = in_len > ref_len ? ref_len : in_len;

    double in_power = 0;
    double ref_power = 0;
    for (int32_t i = 0; i < min_len; i++)
    {
        const double in_sample = in_data[i];
        const double ref_sample = ref_data[i];
        in_power += in_sample * in_sample;
        ref_power += ref_sample * ref_sample;
    }
    return ref_power / in_power;
}
def update_gender(user_id, gender):
    """
    Set a model's gender, but only where it is still 3 (unknown).

    The table is queried first; the update is issued only when exactly one row
    for ``user_id`` has ``gender=3``.

    :param user_id: value matched against ``av_svc_model.user_id``
    :param gender: value written to the ``gender`` column
    :return: None
    """
    # NOTE(review): both statements are built by string formatting, so user_id
    # and gender must come from a trusted source (here: the hard-coded table).
    sql = "select * from av_db.av_svc_model where user_id=\"{}\" and gender=3".format(user_id)
    data = get_data_by_mysql(sql)
    if len(data) == 1:
        # Repeat the gender=3 guard in the UPDATE itself: without it the
        # statement also rewrites rows of this user whose gender is already
        # known, and a row changed after the SELECT would be overwritten.
        sql = "update av_db.av_svc_model set gender={} where user_id=\"{}\" and gender=3".format(gender, user_id)
        update_db(sql)
["10414574148014424", 1], + ["10414574148247626", 2], + ["10414574148624370", 2], + ["10414574148669184", 1], + ["10414574148692388", 1], + ["10414574148859406", 1], + ["10414574149000590", 1], + ["10414574149067094", 1], + ["10414574149143568", 2], + ["10414574149221618", 1], + ["10414574149303702", 2], + ["10696049115833380", 1], + ["10696049115944594", 1], + ["10696049115987498", 1], + ["10696049116130908", 2], + ["10696049116285936", 2], + ["10696049117044138", 2], + ["10696049117276112", 1], + ["10696049117685892", 2], + ["10696049119207544", 1], + ["10696049119659788", 1], + ["10696049120426324", 2], + ["10696049120919532", 1], + ["10696049121183928", 1], + ["10696049121338248", 1], + ["10696049121406512", 1], + ["10696049121502826", 2], + ["10696049123071172", 1], + ["10696049123219186", 1], + ["10696049123447868", 1], + ["10696049123506368", 2], + ["10696049123660154", 2], + ["10696049123805538", 1], + ["10696049124073344", 2], + ["10696049124110520", 1], + ["10696049124182084", 2], + ["10696049124450100", 1], + ["10696049124595430", 2], + ["10696049124833978", 2], + ["10696049125084058", 2], + ["10696049125481092", 2], + ["10696049125584584", 1], + ["10696049125798928", 2], + ["10696049125820940", 1], + ["10696049125864268", 2], + ["10696049125885128", 1], + ["10696049125972416", 1], + ["10696049125997808", 1], + ["10696049125999636", 2], + ["10977524091895906", 2], + ["10977524092611108", 1], + ["10977524092703694", 2], + ["10977524092737576", 2], + ["10977524092926748", 1], + ["10977524093350560", 2], + ["10977524093613618", 1], + ["10977524094859474", 1], + ["10977524096635844", 1], + ["10977524096695280", 2], + ["10977524096819198", 1], + ["10977524096995342", 2], + ["10977524098416100", 1], + ["10977524098804908", 1], + ["10977524099612646", 2], + ["10977524100174518", 1], + ["10977524100978492", 1], + ["10977524101050108", 1], + ["10977524101220516", 1], + ["10977524101243434", 2], + ["10977524101575638", 2], + ["10977524101593280", 1], + 
["10977524101680844", 1], + ["10977524102313334", 1], + ["10977524102348346", 1], + ["10977524102432628", 1], + ["10977524102444474", 2], + ["10977524102525738", 2], + ["10977524102533320", 1], + ["10977524102598012", 1], + ["10977524102674590", 2], + ["10977524102678972", 2], + ["10977524102679572", 2], + ["1125899906849269", 1], + ["1125899908853925", 1], + ["1125899908854526", 1], + ["1125899908904395", 1], + ["1125899909347935", 1], + ["1125899909790502", 1], + ["1125899910057693", 2], + ["1125899910105120", 1], + ["1125899910461551", 1], + ["1125899910516883", 1], + ["1125899910808376", 2], + ["1125899910826302", 2], + ["1125899910943438", 1], + ["1125899911011477", 1], + ["1125899911821662", 2], + ["1125899911962207", 1], + ["1125899912327206", 2], + ["1125899912442110", 2], + ["1125899912511535", 1], + ["1125899912520616", 2], + ["1125899912538184", 1], + ["1125899912584668", 1], + ["1125899912859360", 1], + ["1125899912895306", 2], + ["1125899912929958", 1], + ["1125899912987231", 1], + ["1125899913281334", 1], + ["1125899913294939", 2], + ["3635049378", 1], + ["8725724286358130", 2] + ] + + for aa in arr: + user_id, gender = aa + update_gender(user_id, gender) + # exit(-1) diff --git a/AutoCoverTool/svc_inference/svc_inference_one.py b/AutoCoverTool/svc_inference/svc_inference_one.py index ecb8e8c..b0f7157 100644 --- a/AutoCoverTool/svc_inference/svc_inference_one.py +++ b/AutoCoverTool/svc_inference/svc_inference_one.py @@ -1,215 +1,215 @@ """ SVC推理逻辑 conda activate auto_song_cover_t4 export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame:$PWD/ref/online """ import os import json import shutil from ref.so_vits_svc.inference_main import * from ref.speaker_feature_extractor.sf_extractor_interface import SFExtractorInterface gs_draw_volume_exe = "/data/gpu_env_common/bin/draw_volume" gs_simple_mixer_path = "/data/gpu_env_common/bin/simple_mixer" gs_svci_success = 0 gs_svci_data_params_check_model_path = 1 
class SVCInferenceOne:
    """
    Run one voice-conversion job end to end: transcode the inputs, convert the
    vocal with the SVC model, mix it with the accompaniment, mux the result
    into the video and extract a speaker embedding from the converted vocal.
    """

    def __init__(self):
        # Working copies created by data_prepare().
        self.vocal_32_wav_path = None
        self.vocal_wav_path = None
        self.acc_wav_path = None
        # Model config that sits next to this module.
        self.config = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json")
        self.spk_emb_inst = SFExtractorInterface()

    @staticmethod
    def _run(args):
        """
        Run an external command without going through a shell.

        Arguments are passed as a list, so paths containing spaces or shell
        metacharacters reach the program unchanged.

        :param args: program followed by its arguments
        :return: the exit status, or -1 when the program could not be started
        """
        import subprocess  # local import: leaves the module's import block untouched

        try:
            return subprocess.call([str(arg) for arg in args])
        except OSError as ex:
            # Callers detect failure through the missing output file, as before.
            print("run {} err={}".format(args, ex))
            return -1

    def mix(self, work_dir, svc_file, vocal_file, acc_file, mix_path):
        """
        Turn the converted vocal into the final mixed audio file.

        :param work_dir: directory in which a fresh ``cache`` sub-directory is created
        :param svc_file: converted vocal produced by the model
        :param vocal_file: second input handed to the draw_volume tool
        :param acc_file: accompaniment handed to simple_mixer
        :param mix_path: destination of the final ffmpeg encode
        :return: ``gs_svci_success`` or the code of the step whose output is missing
        """
        cache_dir = os.path.join(work_dir, "cache")
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir)

        # Transcode the converted vocal to 44.1 kHz stereo.
        svc_442_file = os.path.join(cache_dir, "442.wav")
        st = time.time()
        self._run(["ffmpeg", "-i", svc_file, "-ar", "44100", "-ac", "2", "-y", svc_442_file,
                   "-loglevel", "fatal"])
        if not os.path.exists(svc_442_file):
            return gs_svci_svc_trans_442
        logging.info("transcode,{},sp={}".format(svc_file, time.time() - st))

        # Adjust the loudness of the transcoded vocal with draw_volume.
        # NOTE(review): vocal_file looks like the loudness reference - confirm against the tool.
        st = time.time()
        volume_path = os.path.join(cache_dir, "volume.wav")
        self._run([gs_draw_volume_exe, svc_442_file, vocal_file, volume_path])
        if not os.path.exists(volume_path):
            print("{} ERROR draw volume".format(volume_path))
            return gs_svci_svc_volume
        logging.info("draw_volume2,{},sp={}".format(svc_file, time.time() - st))

        # Mix vocal and accompaniment; the trailing "1" is simple_mixer's optional fourth argument.
        st = time.time()
        mix_wav_path = os.path.join(cache_dir, "mix.wav")
        self._run([gs_simple_mixer_path, volume_path, acc_file, mix_wav_path, "1"])
        if not os.path.exists(mix_wav_path):
            return gs_svci_svc_mix
        logging.info("mixer,{},sp={}".format(svc_file, time.time() - st))

        # Encode the mix at 128k into mix_path.
        st = time.time()
        cmd = ["ffmpeg", "-i", mix_wav_path, "-ab", "128k", "-y", mix_path, "-loglevel", "fatal"]
        print(" ".join(cmd))
        self._run(cmd)
        if not os.path.exists(mix_path):
            return gs_svci_svc_mix
        logging.info("encode,{},sp={}".format(svc_file, time.time() - st))
        return gs_svci_success

    def params_check(self, model_path, vocal_path, acc_path, video_path):
        """
        Check that every input file exists.

        :return: ``gs_svci_success`` or the code of the first missing path
        """
        if not os.path.exists(model_path):
            print("model_path={} is null".format(model_path))
            return gs_svci_data_params_check_model_path
        if not os.path.exists(vocal_path):
            print("vocal_path={} is null".format(vocal_path))
            return gs_svci_data_params_check_vocal_path
        if not os.path.exists(acc_path):
            print("acc_path={} is null".format(acc_path))
            return gs_svci_data_params_check_acc_path
        if not os.path.exists(video_path):
            print("video_path={} is null".format(video_path))
            return gs_svci_data_params_check_video_path
        return gs_svci_success

    def data_prepare(self, work_dir, vocal_path, acc_path):
        """
        Transcode the inputs into the wav files the later steps read.

        Produces a 44.1 kHz stereo vocal, a 32 kHz mono vocal and a 44.1 kHz
        stereo accompaniment inside ``work_dir``.

        :return: True when all three files exist afterwards
        """
        self.vocal_32_wav_path = os.path.join(work_dir, "vocal_32.wav")
        self.vocal_wav_path = os.path.join(work_dir, "vocal.wav")
        self.acc_wav_path = os.path.join(work_dir, "acc.wav")

        self._run(["ffmpeg", "-i", vocal_path, "-ar", "44100", "-ac", "2", "-y", self.vocal_wav_path])
        self._run(["ffmpeg", "-i", vocal_path, "-ar", "32000", "-ac", "1", "-y", self.vocal_32_wav_path])
        self._run(["ffmpeg", "-i", acc_path, "-ar", "44100", "-ac", "2", "-y", self.acc_wav_path])

        # vocal.wav is read by mix() later, so it has to be checked here as well;
        # otherwise its absence only surfaces as a draw_volume failure.
        outputs = (self.vocal_32_wav_path, self.vocal_wav_path, self.acc_wav_path)
        return all(os.path.exists(path) for path in outputs)

    def process_logic(self, work_dir, model_path, vocal_path, acc_path, video_path, out_path):
        """
        Run the pipeline inside an existing ``work_dir``.

        :return: ``(err, emb)``; ``emb`` is ``[]`` on every failure path
        """
        # 1. Transcode vocal and accompaniment.
        st = time.time()
        if not self.data_prepare(work_dir, vocal_path, acc_path):
            print("transcode vocal={} or acc={} err!\n".format(vocal_path, acc_path))
            # Must be a pair like every other exit: process() unpacks two values.
            return gs_svci_data_prepare_transcode_media, []
        print("transcode vocal and acc sp={}".format(time.time() - st))

        # 2. Inference on the 32 kHz mono vocal.
        st = time.time()
        svc_file = os.path.join(work_dir, "trans_vocal.wav")
        inf(model_path, self.config, self.vocal_32_wav_path, svc_file, 'prod')
        if not os.path.exists(svc_file):
            print("inference err vocal_path={}, model_path={}".format(vocal_path, model_path))
            return gs_svci_data_inference, []
        print("inf sp={}".format(time.time() - st))

        # 3. Build the mixed audio.
        st = time.time()
        mix_tmp_path = os.path.join(work_dir, "mix.wav")
        err = self.mix(work_dir, svc_file, self.vocal_wav_path, self.acc_wav_path, mix_tmp_path)
        if err != gs_svci_success:
            return err, []
        if not os.path.exists(mix_tmp_path):
            return gs_svci_svc_mix_gen, []
        print("mix sp={}".format(time.time() - st))

        # 4. Encode the audio as AAC and mux it with the copied video stream.
        st = time.time()
        self._run(["ffmpeg", "-i", video_path, "-i", mix_tmp_path, "-acodec", "aac", "-strict", "-2",
                   "-b:a", "128k", "-vcodec", "copy", "-shortest", "-af", "apad", "-y", out_path])
        if not os.path.exists(out_path):
            # The format string used to have one placeholder for two arguments.
            print("mix audio_video err={},{}".format(video_path, mix_tmp_path))
            return gs_svci_svc_mix_audio_video, []
        print("mix audio and video sp={}".format(time.time() - st))

        # 5. Extract the speaker embedding from the converted vocal.
        st = time.time()
        emb = self.spk_emb_inst.process(svc_file)
        print("get emb sp={}".format(time.time() - st))
        return gs_svci_success, emb

    def process(self, work_dir, model_path, vocal_path, acc_path, video_path, out_path):
        """
        Validate the inputs, run the pipeline in a fresh ``work_dir`` and remove it afterwards.

        :param work_dir: scratch directory; deleted if present, recreated, and deleted again at the end
        :return: ``(err, emb)`` as returned by process_logic, or ``(err, [])`` when a path check fails
        """
        err = self.params_check(model_path, vocal_path, acc_path, video_path)
        if err != gs_svci_success:
            return err, []
        if os.path.exists(work_dir):
            shutil.rmtree(work_dir)
        os.makedirs(work_dir)

        st = time.time()
        try:
            err, emb = self.process_logic(work_dir, model_path, vocal_path, acc_path, video_path, out_path)
            print("process_logic sp={}".format(time.time() - st))
        finally:
            # Clean up even when inference raises, so failed jobs leave no scratch data behind.
            shutil.rmtree(work_dir)
        return err, emb