diff --git a/.arcconfig b/.arcconfig new file mode 100644 index 0000000..99e9d55 --- /dev/null +++ b/.arcconfig @@ -0,0 +1,4 @@ +{ + "phabricator.uri" : "http://phabricator.ushow.media/", + "editor": "vim" +} diff --git a/mnn_demo/inc/CRvcLiteSynthesizer.h b/mnn_demo/inc/CRvcLiteSynthesizer.h index cedcb2e..c25d0df 100644 --- a/mnn_demo/inc/CRvcLiteSynthesizer.h +++ b/mnn_demo/inc/CRvcLiteSynthesizer.h @@ -1,82 +1,82 @@ // // Created by Administrator on 2024/1/21. // #ifndef MNN_DEMO_CRVCLITESYNTHESIZER_H #define MNN_DEMO_CRVCLITESYNTHESIZER_H #include "CRvcLiteOnline.h" class CRvcLiteSynthesizer { public: CRvcLiteSynthesizer(); ~CRvcLiteSynthesizer(); public: /** * 初始化 * @param hubert_model 语义模型地址 * @param sample_rate 采样率 * @param channel 通道数 * @return 0 表示正常 */ int init(const char* hubert_model, int sample_rate, int channel); /** * 选择人声模型 * @param synth_model 音色模型地址 * @param enable 是否开启 * @return */ int switch_model(const char* synth_model); /** * 设置变调,范围是[-12, 12] * 有人声模型才生效,否则不生效 * 换人声模型,该状态不会丢失,并且在无人声的时候设置之后,有人声模型后也会生效 * @param key */ void set_up_key(int key); /** * reset,清空内部数据 */ void reset(); /** - * 处理逻辑 + * 处理逻辑:每次输入的长度不要太长,建议在900ms左右即可 * @param in_buf 输入的buf * @param in_len 输入的Buf长度,frame*channel,建议输入小于等于1s的音频长度,尽量的大就好 * @param out_buf 输出的buf * @param out_len 输出的buf长度, frame*channel * 注意: 此处有可能出现输出的长度不一定等于in_len,输出的值会小于等于out_len,但是是连续的,所以out_len可以适当比in_len大一些,从而保证都能搞出来 * @return */ int process(float* in_buf, int in_len, float* out_buf, int &out_len); /** * 获取实时率,处理1s数据的真实耗时/1s * @return */ float get_rtf(); private: std::shared_ptr m_rvc_inst; std::shared_ptr m_resample2_16; std::shared_ptr m_resample2src; int m_channel; int m_sample_rate; std::shared_ptr m_buf_tmp_16k; int m_buf_tmp_16k_len; int m_buf_tmp_16k_cap; std::shared_ptr m_buf_tmp_32k; int m_buf_tmp_32k_len; int m_buf_tmp_32k_cap; std::shared_ptr m_buf_tmp_src; int m_buf_tmp_src_len; int m_buf_tmp_src_cap; bool m_first; }; #endif //MNN_DEMO_CRVCLITESYNTHESIZER_H diff --git a/mnn_demo/main.cpp b/mnn_demo/main.cpp index 0d6b685..d742793 100644 --- a/mnn_demo/main.cpp +++ b/mnn_demo/main.cpp @@ -1,285 +1,286 @@ #include #include #include #include "src/Hubert.h" #include "src/CSynthesizer.h" #include "CRvcLiteSynthesizer.h" #include "CRvcLiteOnlineV2.h" int test_hubert() { const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v1_fp16.mnn"; Hubert hubert; int err_code = hubert.init(hubert_model_path); std::vector input(33280, 0.1); std::vector>> ret; ret.resize(1); ret[0].resize(205); for (int i = 0; i < 205; i++) { ret[0][i].resize(256); } float time = hubert.process(input.data(), ret); return 0; } int test_contentvec() { const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn"; CSynthesizer contentVec; int err_code = contentVec.init(contentvec_model_path); std::vector>> input(1); input[0].resize(205); for (int i = 0; i < 205; i++) { for (int j = 0; j < 258; j++) { if (j == 256) { input[0][i].push_back(0.2); } else if (j == 257) { input[0][i].push_back(1.0); } else { input[0][i].push_back(0.1); } } } std::vector>> ret; ret.resize(1); for (int i = 0; i < 1; i++) { ret[i].resize(1); ret[i][0].resize(35840); } float tot = 0.f; for (int i = 0; i < 10; i++) { float time = contentVec.process(input, ret); tot += time; } printf("time: %f \n", tot / 100.f); return 0; } #include "CRvcLiteOnline.h" #include "av_waves/waves/inc/STWaveFile.h" void test() { const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn"; const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn"; const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_16.wav"; // const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav"; const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v1.wav"; CRvcLiteOnline rvc_inst; rvc_inst.init(hubert_model_path); // 读取音频文件, 要求16k,单声道 STCWaveFile wav_inst(in_wav, false); int sample_rate = wav_inst.GetSampleRate(); int channel = wav_inst.GetChannels(); int len = wav_inst.GetTotalFrames() * channel; float *data = new float[len]; float *outdata = new float[len * 2]; wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames()); int step = sample_rate; printf("start ..\n"); for (int i = 0; i < len; i += step) { if (i + step > len) { step = len - i; } struct timeval start; struct timeval end; gettimeofday(&start, NULL); rvc_inst.process_block(data + i, step, outdata + 2 * i, 2 * step); gettimeofday(&end, NULL); printf("sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0); } STCWaveFile wav_out_inst(out_wav, true); wav_out_inst.SetSampleRate(32000); wav_out_inst.SetChannels(1); wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT); wav_out_inst.SetupDone(); wav_out_inst.WriteFrame(outdata, len * 2); printf("finish2 ....\n"); } void test_rvc_lite_synth() { const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn"; const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn"; const char *out_wav = "/mnt/d/dataset/tmp/i_out3.wav"; const char *in_wav = "/mnt/d/dataset/tmp/t1.wav"; STCWaveFile wav_inst(in_wav, false); int sample_rate = wav_inst.GetSampleRate(); int channel = wav_inst.GetChannels(); int len = wav_inst.GetTotalFrames() * channel; float *data = new float[len]; float *outdata = new float[len]; wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames()); CRvcLiteSynthesizer m_rvc_inst; int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel); printf("init err=%d!\n", err); printf("rtf=%f\n", m_rvc_inst.get_rtf()); int step = sample_rate * channel - 100 * channel; int out_len = 0; for(int i = 0; i < len; i+=step) { if (i + step > len) { step = len - i; } int out_step = step; err = m_rvc_inst.process(data+i, step, outdata+out_len, out_step); if(err != ERR_RVC_LITE_SUCCESS) { printf("process err=%d!\n", err); return ; } out_len += out_step; } STCWaveFile wav_out_inst(out_wav, true); wav_out_inst.SetSampleRate(sample_rate); wav_out_inst.SetChannels(channel); wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT); wav_out_inst.SetupDone(); wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames()); delete[] data; delete[] outdata; } void test_rvc_lite_v2() { const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn"; const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn"; const char *out_wav = "/mnt/d/dataset/tmp/i_out_01_r.wav"; - const char *in_wav = "/mnt/d/dataset/tmp/t1.wav"; + const char *in_wav = "/mnt/d/dataset/tmp/t1_48.wav"; STCWaveFile wav_inst(in_wav, false); int sample_rate = wav_inst.GetSampleRate(); int channel = wav_inst.GetChannels(); int len = wav_inst.GetTotalFrames() * channel; float *data = new float[len]; float *outdata = new float[len]; wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames()); CRvcLiteOnlineV2 m_rvc_inst; int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel); // m_rvc_inst.switch_model(syz_model); // m_rvc_inst.set_up_key(0); printf("init err=%d!\n", err); int step = sample_rate * channel - 100 * channel; int out_len = 0; bool last = false; int flag = 0; for(int i = 0; i < len; i+=step) { if (i + step > len) { step = len - i; last = true; } int out_step = step; err = m_rvc_inst.push(data+i, step, last); if(err != ERR_RVC_LITE_SUCCESS) { printf("process err=%d!\n", err); return ; } if (i >= len / 3 && flag == 0) { flag = 1; m_rvc_inst.switch_model(syz_model); } + if (i >= len / 2 && flag == 1) { flag = 2; m_rvc_inst.reset(); } out_step = 2 * step; m_rvc_inst.pop(outdata+out_len, out_step); out_len += out_step; } STCWaveFile wav_out_inst(out_wav, true); wav_out_inst.SetSampleRate(sample_rate); wav_out_inst.SetChannels(channel); wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT); wav_out_inst.SetupDone(); wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames()); delete[] data; delete[] outdata; } void test_rvc_lite_online() { // const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn"; // const char *hubert_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/layers6_checkpoint_14_1660000_1_hubert.mnn"; const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn"; // const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn"; // const char *syz_model = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xusong_v1_6hubert_hifix_syz_base_vctk_kd_32k_hubert6_jianli_e225_s62775_205.mnn"; const char *xs_model = "/mnt/d/dataset/svc/models/layers_3/layer3_xusong.mnn"; const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn"; // const char *contentvec_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xiafan_fp16.mnn"; // const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01.wav"; const char *in_wav = "/mnt/d/dataset/tmp/t1.wav"; // const char* in_wav = "/mnt/d/dataset/svc/dataset/短数据样本/男声/qiankun.wav"; // const char* in_wav = "/mnt/d/dataset/tmp/i.wav"; // const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav"; // const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v4.wav"; // const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/qiankun_412_v4.wav"; const char *out_wav = "/mnt/d/dataset/tmp/i_out2.wav"; // 读取音频文件, 要求16k,单声道 STCWaveFile wav_inst(in_wav, false); int sample_rate = wav_inst.GetSampleRate(); int channel = wav_inst.GetChannels(); int len = wav_inst.GetTotalFrames() * channel; float *data = new float[len]; float *outdata = new float[len]; CRvcLiteOnlineRealTime rvc_inst; rvc_inst.init(hubert_model_path, sample_rate, channel); wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames()); int step = 1024; printf("start ..\n"); bool flag = true; rvc_inst.switch_synth(syz_model); for (int i = 0; i < len; i += step) { if (i + step > len) { step = len - i; } struct timeval start; struct timeval end; gettimeofday(&start, NULL); int ret = rvc_inst.process(data + i, step, outdata+i, step); std::this_thread::sleep_for(std::chrono::milliseconds (15)); gettimeofday(&end, NULL); printf("ret = %d, sp = %f ms step=%d\n", ret, (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0, step); if (flag && i >= len / 3) { flag = false; rvc_inst.reset(); // rvc_inst.switch_synth(xs_model); } } STCWaveFile wav_out_inst(out_wav, true); wav_out_inst.SetSampleRate(sample_rate); wav_out_inst.SetChannels(channel); wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT); wav_out_inst.SetupDone(); wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames()); float* flush_data; int flush_len; rvc_inst.flush(flush_data, flush_len); wav_out_inst.WriteFrame(flush_data, flush_len/channel); printf("finish2 ....\n"); } int main() { // int ret_hubert = test_hubert(); // int ret_contentvec = test_contentvec(); // test(); // test(); // test_rvc_lite_online(); // test_rvc_lite_synth(); test_rvc_lite_v2(); return 0; } diff --git a/mnn_demo/src/CRvcLiteOnline.cpp b/mnn_demo/src/CRvcLiteOnline.cpp index f9067f7..60b9fa6 100644 --- a/mnn_demo/src/CRvcLiteOnline.cpp +++ b/mnn_demo/src/CRvcLiteOnline.cpp @@ -1,831 +1,831 @@ // // Created by Administrator on 2023/11/29. // #include #include #include #include "CRvcLiteOnline.h" #include "Hubert.h" #include "CSynthesizer.h" #include "espyin-v1.0/ESPYIN.h" #include "ThreadPool.h" #include "CRvcCircleBuffer.h" #include "FfmpegResampler.h" #include inline bool file_exists (const std::string& name) { return ( access( name.c_str(), F_OK ) != -1 ); } // size代表了buf的长度 void stereo2mono(float *input, int size, float *output) { for (int i = 0; i < size - 1; i += 2) { output[i / 2] = (input[i] + input[i + 1]) / 2; } } void mono2stereo(float *input, int size, float *output) { for (int i = 0; i < size; i++) { output[2 * i] = input[i]; output[2 * i + 1] = input[i]; } } CRvcLiteOnline::CRvcLiteOnline() { init_variable(); m_init = false; m_switch_model = false; // 输入部分需要的变量 // 要求输入的时间片长度,采样点数 m_input_block_frame = int(gs_block_time * gs_src_samplerate); // 推理时额外需要的长度 m_input_extra_frame = int(gs_extra_time * gs_src_samplerate); int zc = gs_src_samplerate / 100; // 10ms的点数 int input_corssfade_frame = int(gs_crossfade_time * gs_src_samplerate); // 推理时使用的buffer长度 m_input_predict_buf_frame = int(ceil((m_input_extra_frame + input_corssfade_frame + m_input_block_frame) * 1.0 / zc) * zc); // 推理时使用的buffer m_input_predict_buf = new float[m_input_predict_buf_frame]; memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame); // 输出部分需要的变量 m_crossfade_frame = int(gs_crossfade_time * gs_dst_samplerate); m_output_block_frame = int(gs_block_time * gs_dst_samplerate); int output_extra_frame = int(gs_extra_time * gs_dst_samplerate); zc = gs_dst_samplerate / 100; m_output_cache_buf_frame = int(ceil((m_output_block_frame + m_crossfade_frame + output_extra_frame) * 1.0 / zc) * zc); m_output_cache_buf = new float[m_output_cache_buf_frame]; memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame); m_crossfade_buf = new float[m_crossfade_frame]; memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame); // 对于模型的输入和输出进行缓存 // 此处是写死的和模型有关 m_hubert_ret.resize(1); m_hubert_ret[0].resize(gs_hubert_frame); for (int i = 0; i < gs_hubert_frame; i++) { m_hubert_ret[0][i].resize(gs_hubert_dim); } // synth模型的输入 m_synth_input.resize(1); m_synth_input[0].resize(gs_synth_input_frame); for (int i = 0; i < gs_synth_input_frame; i++) { m_synth_input[0][i].resize(gs_synth_input_dim); } m_synth_out.resize(1); m_synth_out[0].resize(1); m_synth_out[0][0].resize(gs_synth_output_frame); } CRvcLiteOnline::~CRvcLiteOnline() { uninit(); } /**********************************对内函数*********************************************/ void CRvcLiteOnline::uninit() { if (m_input_predict_buf != NULL) { delete[] m_input_predict_buf; m_input_predict_buf = NULL; } if (m_output_cache_buf != NULL) { delete[] m_output_cache_buf; m_output_cache_buf = NULL; } if (m_crossfade_buf != NULL) { delete[] m_crossfade_buf; m_crossfade_buf = NULL; } init_variable(); } void CRvcLiteOnline::get_pyin_f0() { - for (int i = 0; i < m_input_predict_buf_frame; i += 160) { + for (int i = 0; i < m_input_predict_buf_frame - 1024 - 160; i += 160) { m_es_pyin->process(m_input_predict_buf + i); } m_f0_data.clear(); ESFeatureSet feats = m_es_pyin->getRemainingFeatures(); if (!feats.empty()) { m_f0_data.resize(feats[4].size()); for (size_t i = 0; i < feats[4].size(); ++i) { // 设置变调 m_f0_data[i] = feats[4][i].values[0] * m_f0_up_key; if (m_f0_data[i] < 0) { m_f0_data[i] = 0; } } } m_es_pyin->reset(); get_f0_post(); } void CRvcLiteOnline::get_f0_post() { int f0_min = 50; int f0_max = 1100; float f0_mel_min = 1127 * log2(1 + f0_min * 1.0 / 700); float f0_mel_max = 1127 * log2(1 + f0_max * 1.0 / 700); m_f0_coarse_data.clear(); m_f0_coarse_data.resize(m_f0_data.size()); for (int i = 0; i < m_f0_data.size(); i++) { float f0_mel = 1127 * log2(1 + m_f0_data[i] / 700); if (f0_mel > 0) { f0_mel = (f0_mel - f0_mel_min) * 254.f / (f0_mel_max - f0_mel_min) + 1; } if (f0_mel <= 1) { f0_mel = 1; } else if (f0_mel > 255) { f0_mel = 255; } m_f0_coarse_data[i] = float(int(f0_mel + 0.5)); } } void CRvcLiteOnline::init_variable() { m_init = false; m_switch_model = false; // 缓存使用的数据 // 要求输入的时间片长度,采样点数 m_input_block_frame = 0; m_input_extra_frame = 0; m_input_predict_buf_frame = 0; m_input_predict_buf = nullptr; m_f0_data.clear(); m_f0_coarse_data.clear(); m_crossfade_frame = 0; m_output_block_frame = 0; m_output_cache_buf_frame = 0; m_crossfade_buf = nullptr; m_output_cache_buf = nullptr; // 各个实例的返回结果 m_hubert_ret.clear(); m_synth_input.clear(); m_synth_out.clear(); m_fade_in = true; m_f0_up_key = 1.f; m_f0_new_up_key = 1.f; } /**********************************对外函数*********************************************/ int CRvcLiteOnline::init(const char *hubert_model_path) { if (m_init) { return ERR_RVC_LITE_REINIT; } m_hubert_inst = std::make_shared(); m_synthesizer_inst = std::make_shared(); m_hubert_inst->init(hubert_model_path); // m_synthesizer_inst->init(synth_model_path); // 要求stepSize必须是2^n m_es_pyin = std::make_shared(16000, 160, 1024, 50, 1100); m_init = true; m_switch_model = false; m_fade_in = true; m_f0_up_key = 1.f; m_f0_new_up_key = 1.f; return ERR_RVC_LITE_SUCCESS; } int CRvcLiteOnline::switch_synth_model(const char *synth_model_path) { if (!m_init) { return ERR_RVC_LITE_NOT_INIT; } if (file_exists(synth_model_path)) { m_synthesizer_inst = std::make_shared(); m_synthesizer_inst->init(synth_model_path); m_switch_model = true; return ERR_RVC_LITE_SUCCESS; } return ERR_RVC_LITE_MODEL_NOT_EXISTS; } void CRvcLiteOnline::set_up_key(int key) { if (key > 12) { key = 12; } if (key < -12) { key = -12; } m_f0_new_up_key = pow(2, key / 12.f); } void CRvcLiteOnline::reset() { memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame); memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame); memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame); m_fade_in = true; } int CRvcLiteOnline::process_block(float *in_buf, int in_len, float *out_buf, int out_len) { if (!m_init) { return ERR_RVC_LITE_NOT_INIT; } if (!m_switch_model) { return ERR_RVC_LITE_NOT_SWITCH_MODEL; } // 外部数据产生不连贯,比如做了reset的时候,需要做fade_in if (m_fade_in) { for(int i = 0; i < in_len; i++) { float rate = i * 1.0 / in_len; in_buf[i] = in_buf[i] * rate; } m_fade_in = false; } // 剔除尾部的block的数据 memcpy(m_input_predict_buf, m_input_predict_buf + in_len, sizeof(float) * (m_input_predict_buf_frame - in_len)); // 向尾部填充in_buf的数据 memcpy(m_input_predict_buf + (m_input_predict_buf_frame - in_len), in_buf, sizeof(float) * in_len); // 提取f0特征序列 struct timeval start; struct timeval end; gettimeofday(&start, NULL); m_f0_up_key = m_f0_new_up_key; get_pyin_f0(); gettimeofday(&end, NULL); LOGE("CRvcLiteOnline", "get pyin sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0); // 推理hubert gettimeofday(&start, NULL); m_hubert_inst->process(m_input_predict_buf, m_hubert_ret); gettimeofday(&end, NULL); LOGE("CRvcLiteOnline", "m_hubert_inst sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0); // 合成语音 for (int i = 0; i < gs_synth_input_frame; i++) { // 拷贝数据 1,gs_hubert_frame,258 for (int j = 0; j < gs_hubert_dim; j++) { m_synth_input[0][i][j] = m_hubert_ret[0][i][j]; } m_synth_input[0][i][256] = m_f0_coarse_data[i]; m_synth_input[0][i][257] = m_f0_data[i]; } gettimeofday(&start, NULL); m_synthesizer_inst->process(m_synth_input, m_synth_out); gettimeofday(&end, NULL); LOGE("CRvcLiteOnline", "m_synthesizer_inst sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0); // 将结果全部放到缓存中 memcpy(m_output_cache_buf, m_output_cache_buf + gs_synth_output_frame, sizeof(float) * (m_output_cache_buf_frame - gs_synth_output_frame)); memcpy(m_output_cache_buf + (m_output_cache_buf_frame - gs_synth_output_frame), m_synth_out[0][0].data(), sizeof(float) * gs_synth_output_frame); int start_pos = m_output_cache_buf_frame - m_crossfade_frame - out_len; memcpy(out_buf, m_output_cache_buf + start_pos, sizeof(float) * out_len); // 对头部数据做fade_in以及fadeout for (int i = 0; i < m_crossfade_frame; i++) { float rate = float(i * 1.f / m_crossfade_frame); out_buf[i] = rate * out_buf[i] + m_crossfade_buf[i] * (1 - rate); } memcpy(m_crossfade_buf, m_output_cache_buf + (m_output_cache_buf_frame - m_crossfade_frame), sizeof(float) * m_crossfade_frame); return 0; } int CRvcLiteOnline::get_latency_ms() { // 此处除了block的延迟,还有推理时hubert理论上应该获取208,实际获取205帧,所以少的30ms return gs_crossfade_time * 1000 + 30; } /*******************************对内的类**************************************/ CResample::CResample() { m_resample_inst = nullptr; } CResample::~CResample() { } int CResample::init(int in_samplerate, int out_samplerate, int in_channel, int out_channel) { // 只是通道数不一致时走自驱逻辑 m_in_channel = in_channel; m_out_channel = out_channel; if (in_samplerate == out_samplerate && in_channel != out_channel) { m_resample_inst = nullptr; } else { m_resample_inst = std::make_shared(); return m_resample_inst->init(in_samplerate, out_samplerate, in_channel, out_channel); } return ERR_RVC_LITE_SUCCESS; } int CResample::get_out_samples(int num) { if (m_resample_inst) { return m_resample_inst->get_out_samples(num); } return num; } void CResample::reset() { if (m_resample_inst) { return m_resample_inst->reset(); } } int CResample::get_latency() { if (m_resample_inst) { return m_resample_inst->get_latency(); } return 0; } int CResample::resample(float *in_buf, int in_num, float *out_buf, int &out_num) { if (m_resample_inst) { return m_resample_inst->resample(in_buf, in_num, out_buf, out_num); } if (m_in_channel == 2 && m_out_channel == 1) { if (out_num < in_num) { return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT; } stereo2mono(in_buf, in_num, out_buf); return ERR_RVC_LITE_SUCCESS; } if (m_in_channel == 1 && m_out_channel == 2) { if (out_num < in_num) { return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT; } mono2stereo(in_buf, in_num, out_buf); return ERR_RVC_LITE_SUCCESS; } return ERR_RVC_LITE_SUCCESS; } /*******************************对外的类***************************************/ /*******************************对内函数***************************************/ void CRvcLiteOnlineRealTime::init_variable() { m_init = false; m_rvc_stop = true; m_sample_rate = 44100; m_channel = 1; m_synth_path = ""; m_new_synth_path = ""; m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT; } /*******************************对外函数***************************************/ CRvcLiteOnlineRealTime::CRvcLiteOnlineRealTime() { init_variable(); } CRvcLiteOnlineRealTime::~CRvcLiteOnlineRealTime() { uninit(); } int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, int sample_rate, int channel) { if (m_init) { return ERR_RVC_LITE_RT_REINIT; } if (sample_rate < 16000) { return ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR; } init_variable(); m_sample_rate = sample_rate; m_channel = channel; m_synth_path = ""; m_new_synth_path = ""; m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT; int output_one_sec_number = m_sample_rate * m_channel; // 临时使用的数据 int latency_len = gs_crossfade_time * m_sample_rate * m_channel; CThreadPool::Task task = std::bind(&CRvcLiteOnlineRealTime::rvc_process, this); m_rvc_inst = std::make_shared(); int err = m_rvc_inst->init(hubert_model_path); if (ERR_RVC_LITE_SUCCESS != err) { goto exit; } // 重采样部分 m_resample_queue = std::make_shared(sample_rate * 3 * m_channel); m_resample16 = std::make_shared(); err = m_resample16->init(m_sample_rate, gs_src_samplerate, m_channel, 1); if (ERR_RVC_LITE_SUCCESS != err) { goto exit; } m_resample2src = std::make_shared(); err = m_resample2src->init(gs_dst_samplerate, m_sample_rate, 1, m_channel); if (ERR_RVC_LITE_SUCCESS != err) { goto exit; } m_resample_buf_max_len = 2048; // 此时空间最大是2048,保证不超即可 m_resample_in_buf = std::shared_ptr(new float[m_resample_buf_max_len], std::default_delete()); m_resample_out_buf = std::shared_ptr(new float[m_resample_buf_max_len], std::default_delete()); // 核心处理部分 m_input_tmp_buf_len = gs_src_samplerate; m_output_tmp_buf_len = gs_dst_samplerate; m_input_tmp_buf = std::shared_ptr(new float[m_input_tmp_buf_len], std::default_delete()); m_output_tmp_buf = std::shared_ptr(new float[m_output_tmp_buf_len], std::default_delete()); memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len); memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len); // 循环buffer m_input_queue = std::make_shared(m_input_tmp_buf_len * 3); // 对外的是目标的采样率和通道数的数据 m_out_queue = std::make_shared(output_one_sec_number * 3); m_latency_queue = std::make_shared(latency_len); // 提前塞入两组,保证延迟稳定在2s for (int i = 0; i < 2; i++) { // 塞入1s数据 for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) { m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len); } m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len); } // 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟 for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) { m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len); } m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len); // 开始处理线程 m_thread_pool = std::make_shared(); m_thread_pool->start(1); m_rvc_stop = false; m_thread_pool->run(task); m_init = true; exit: if (ERR_RVC_LITE_SUCCESS != err) { m_init = true; uninit(); } return err; } int CRvcLiteOnlineRealTime::switch_synth(const char *synth_model_path) { if (!m_init) { return ERR_RVC_LITE_RT_NOT_INIT; } { std::unique_lock lock(m_rvc_mutex); m_new_synth_path = synth_model_path; } return ERR_RVC_LITE_SUCCESS; } int CRvcLiteOnlineRealTime::process(float *in_buf, int in_len, float *out_buf, int out_len) { if (!m_init) { return ERR_RVC_LITE_RT_NOT_INIT; } // 写入数据 { std::unique_lock lock(m_rvc_mutex); m_resample_queue->push(in_buf, in_len); m_rvc_cond.notify_all(); } memset(out_buf, 0, sizeof(float) * out_len); int tmp_out_len = out_len; // 获取数据 { std::unique_lock lock(m_rvc_mutex); m_out_queue->pop(out_buf, tmp_out_len); } if (tmp_out_len != out_len) { return ERR_RVC_LITE_RT_NOT_ENOUGH_DATA; } return ERR_RVC_LITE_SUCCESS; } void CRvcLiteOnlineRealTime::reset() { if (!m_init) { return; } { std::unique_lock lock(m_rvc_mutex); m_resample_queue->reset(); m_resample16->reset(); m_resample2src->reset(); m_input_queue->reset(); m_out_queue->reset(); m_rvc_inst->reset(); m_latency_queue->reset(); // 提前塞入两组,保证延迟稳定在2s int output_one_sec_number = m_sample_rate * m_channel; // 临时使用的数据 memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len); for (int i = 0; i < 2; i++) { for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) { m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len); } m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len); } // 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟 int latency_len = gs_crossfade_time * m_sample_rate * m_channel; for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) { m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len); } m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len); } } void CRvcLiteOnlineRealTime::flush(float *&out_buf, int &len) { // 将内部的所有的数据吐出来 /** * 先停止 */ stop(); // 无音色转换的情况 int resample_in_len = 0; int resample_out_len = 0; if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT) { while (m_resample_queue->size() > 0) { resample_in_len = m_resample_buf_max_len; m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len); m_latency_queue->push(m_resample_in_buf.get(), resample_in_len); m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len); m_out_queue->push(m_resample_in_buf.get(), resample_in_len); } while(m_latency_queue->size() > 0) { resample_in_len = m_resample_buf_max_len; m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len); m_out_queue->push(m_resample_in_buf.get(), resample_in_len); } len = m_out_queue->size(); out_buf = new float[len]; m_out_queue->pop(out_buf, len); return; } // 有音色转换的情况 while (m_resample_queue->size() > 0) { resample_in_len = m_resample_buf_max_len; m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len); // 输入的数据需要考虑channel resample_out_len = m_resample16->get_out_samples(resample_in_len / m_channel); m_resample16->resample(m_resample_in_buf.get(), resample_in_len / m_channel, m_resample_out_buf.get(), resample_out_len); // 输出是16k单声道,不需要考虑 m_input_queue->push(m_resample_out_buf.get(), resample_out_len); } memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len); int add_size = m_input_tmp_buf_len - m_input_queue->size() % m_input_tmp_buf_len; if (add_size != 0 && add_size < m_input_tmp_buf_len) { m_input_queue->push(m_input_tmp_buf.get(), add_size); } int num = m_input_queue->size() / m_input_tmp_buf_len; for (int i = 0; i < num; i++) { rvc_process_step(); } // 将所有数据拷贝出来 len = m_out_queue->size(); out_buf = new float[len]; m_out_queue->pop(out_buf, len); } int CRvcLiteOnlineRealTime::get_latency_ms() { return m_rvc_inst->get_latency_ms() + 2000; } /*******************************对内函数***************************************/ void CRvcLiteOnlineRealTime::uninit() { if (!m_init) { return; } stop(); } void CRvcLiteOnlineRealTime::stop() { // 释放thread_pool的数据,先通知一下rvc_process,防止是在等待中 m_rvc_stop = true; if (m_thread_pool) { m_rvc_cond.notify_all(); m_thread_pool->stop(); } } void CRvcLiteOnlineRealTime::rvc_process_step() { struct timeval start; struct timeval end; int sample_out_len = 0; // 开始处理 if (m_input_queue->size() < m_input_tmp_buf_len) { return; } gettimeofday(&start, NULL); m_input_queue->pop(m_input_tmp_buf.get(), m_input_tmp_buf_len); m_rvc_inst->process_block(m_input_tmp_buf.get(), m_input_tmp_buf_len, m_output_tmp_buf.get(), m_output_tmp_buf_len); gettimeofday(&end, NULL); LOGD("RvcLite", "rvc_process process sp %f ms", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0); // 重采样 // 考虑到此处采样率变大,但是最多也不到两倍,但是通道数有可能扩展到两倍,所以按照1/4进行设置 gettimeofday(&start, NULL); bool last = false; int step = m_resample_buf_max_len / 4; for (int i = 0; i < m_output_tmp_buf_len; i += step) { if (i + step >= m_output_tmp_buf_len) { step = m_output_tmp_buf_len - i; last = true; } // 此时的输入是单声道,采样点数量和总长度一致 sample_out_len = m_resample2src->get_out_samples(step); m_resample2src->resample(m_output_tmp_buf.get() + i, step, m_resample_out_buf.get(), sample_out_len); // 从有到无 if(last && m_syn_state == RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT) { // 因为不加音效也需要延迟对齐,所以此处只要做fade_out就行了 for(int ii =0; ii < sample_out_len * m_channel; ii+=m_channel) { float rate = ii * 1.0 / step; for(int jj = 0; jj < m_channel; jj++) { m_resample_out_buf.get()[ii+jj] = m_resample_out_buf.get()[ii+jj] * (1 - rate); } } m_syn_state = RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT; } { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(m_resample_out_buf.get(), sample_out_len * m_channel); } } gettimeofday(&end, NULL); LOGD("RvcLite", "rvc_process re_resample sp %f ms", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0); printf("finish ...\n"); } void CRvcLiteOnlineRealTime::rvc_process() { int sample_in_len; int sample_out_len = 0; while (!m_rvc_stop) { { // 重采样 std::unique_lock lock(m_rvc_mutex); if (m_resample_queue->size() < m_resample_buf_max_len) { // 睡眠前检查下情况 if (m_rvc_stop) { return; } m_rvc_cond.wait(lock); continue; } sample_in_len = m_resample_buf_max_len; m_resample_queue->pop(m_resample_in_buf.get(), sample_in_len); } /** * 此处有三种情况: * 因为无论哪种变换,有延迟的存在,导致输入的数据都是需要塞0进去,所以对当前的数据做fade_out即可 * 1. 无到有:对无到有的部分做个fade_out,对下一帧要塞入音效器的部分做fade_in * 2. 有到无:对无到有的部分做个fade_out,对下一帧要塞入音效器的部分做fade_in * 3. 有到有[这个不用考虑,内部自己做了处理] */ if (m_synth_path != m_new_synth_path) { // 从无到有,此时对本帧做fade_out,对下一帧输入做fade_in if(m_synth_path.empty() && !m_new_synth_path.empty()) { m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT; } // 从有到无 if (!m_synth_path.empty() && m_new_synth_path.empty()) { m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT; } { std::unique_lock lock(m_rvc_mutex); m_synth_path = m_new_synth_path; } m_rvc_inst->switch_synth_model(m_new_synth_path.c_str()); } // 刚切过来第一次做效果 if(m_syn_state == RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT) { // 刚从有到无,需要清空数据,以及对输入的队列添加fade_in m_latency_queue->reset(); // 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟 memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len); int latency_len = gs_crossfade_time * m_sample_rate * m_channel; for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) { m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len); } m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len); // 对输入做fade_in for(int i = 0; i < sample_in_len; i+=m_channel) { float rate = i * 1.0 / sample_in_len; for(int j = 0; j < m_channel; j++) { m_resample_in_buf.get()[i+j] *= rate; } } m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT; } // 不做效果 if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT) { m_latency_queue->push(m_resample_in_buf.get(), sample_in_len); m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len); { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(m_resample_in_buf.get(), sample_in_len); } continue; } // 从无到有的转换 if (m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT) { // 做fade_out for(int i = 0; i < sample_in_len; i+=m_channel) { float rate = i * 1.0 / sample_in_len; for(int j = 0; j < m_channel; j++) { m_resample_in_buf.get()[i+j] *= 1 - rate; } } m_latency_queue->push(m_resample_in_buf.get(), sample_in_len); m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len); { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(m_resample_in_buf.get(), sample_in_len); } // 此时对于rvc来说输入的数据不连贯了,所以清空内部数据重新搞 m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT; m_rvc_inst->reset(); continue; } // 重采样到16k,此处采样率变低,所以不会出现sample_out_len > sample_in_len的情况 sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel); m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(), sample_out_len); m_input_queue->push(m_resample_out_buf.get(), sample_out_len); rvc_process_step(); } } \ No newline at end of file diff --git a/mnn_demo/src/CRvcLiteOnlineV2.cpp b/mnn_demo/src/CRvcLiteOnlineV2.cpp index 0269a7d..97a67fb 100644 --- a/mnn_demo/src/CRvcLiteOnlineV2.cpp +++ b/mnn_demo/src/CRvcLiteOnlineV2.cpp @@ -1,215 +1,215 @@ // // Created by Administrator on 2024/1/22. // #include "CRvcLiteOnlineV2.h" #include "CRvcCircleBuffer.h" #include inline bool file_exists1 (const std::string& name) { return ( access( name.c_str(), F_OK ) != -1 ); } CRvcLiteOnlineV2::CRvcLiteOnlineV2() { } CRvcLiteOnlineV2::~CRvcLiteOnlineV2() { } /*****************************************对内函数***************************************************************/ void CRvcLiteOnlineV2::set_cur_state(bool reset) { /** * 一共三种状态 * 从无到有: 让不做效果的fade_out,做效果的fade_in * 从有到无: 让做效果的fade_out, 不做效果的fade_in即可 * 从有到有,这种情况不考虑,内部自己会做fade */ if (m_syn_model != m_new_syn_model) { // 从无到有 if (m_syn_model.empty() && !m_new_syn_model.empty()) { m_sync_state = CRVC_V2_STATE_DEFAULT2EFFECT; // 如果此时已经发生了reset,则不需要做切换,直接做就行 if (reset) { m_sync_state = CRVC_V2_STATE_EFFECT; } m_syn_model = m_new_syn_model; m_rvc_inst->switch_model(m_syn_model.c_str()); } // 从有到无 if (!m_syn_model.empty() && m_new_syn_model.empty()) { m_sync_state = CRVC_V2_STATE_EFFECT2DEFAULT; // 如果此时已经发生了reset,则不需要做切换,直接做就行 if (reset) { m_sync_state = CRVC_V2_STATE_DEFAULT; } m_syn_model = m_new_syn_model; } } } /*****************************************对外函数***************************************************************/ int CRvcLiteOnlineV2::init(const char *hubert_model, int sample_rate, int channel) { m_rvc_inst = std::make_shared(); - m_block_len = sample_rate * channel - 100 * channel; + m_block_len = int(sample_rate * 0.9) * channel; // 每900ms处理一次 m_tmp_buf_len = m_block_len * 2; m_reset = true; m_syn_model = ""; m_new_syn_model = ""; m_sync_state = CRVC_V2_STATE_DEFAULT; m_fade_len = int(sample_rate * 0.05) * channel; // 50ms的时长用来做fade m_channel = channel; m_tmp_in_buf = std::shared_ptr(new float[m_tmp_buf_len], std::default_delete()); m_tmp_out_buf = std::shared_ptr(new float[m_tmp_buf_len], std::default_delete()); m_in_queue = std::make_shared(m_tmp_buf_len * 2); m_out_queue = std::make_shared(m_tmp_buf_len * 2); m_input_latency_output_frame = 0; return m_rvc_inst->init(hubert_model, sample_rate, channel); } int CRvcLiteOnlineV2::switch_model(const char *synth_model) { if (synth_model != "" && !file_exists1(synth_model)) { return ERR_RVC_LITE_MODEL_NOT_EXISTS; } m_new_syn_model = synth_model; return ERR_RVC_LITE_SUCCESS; } void CRvcLiteOnlineV2::set_up_key(int key) { // 内部是线程安全的,所以直接设置即可 m_rvc_inst->set_up_key(key); } void CRvcLiteOnlineV2::reset() { m_reset = true; } int CRvcLiteOnlineV2::push(float *buf, int len, bool last) { bool reset = m_reset; if (m_reset) { m_reset = false; m_input_latency_output_frame = 0; m_in_queue->reset(); m_out_queue->reset(); m_rvc_inst->reset(); } set_cur_state(reset); if (CRVC_V2_STATE_DEFAULT == m_sync_state) { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(buf, len); return ERR_RVC_LITE_SUCCESS; } // 此时无论怎样,都要让模型跑一下,得到结果再说 m_in_queue->push(buf, len); while(m_in_queue->size() >= m_block_len || last) { if (m_in_queue->size() <= 0) { return ERR_RVC_LITE_SUCCESS; } int cur_in_len = m_block_len; int cur_out_len = m_block_len; m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len); int err = m_rvc_inst->process(m_tmp_in_buf.get(), cur_in_len, m_tmp_out_buf.get(), cur_out_len); if (err != ERR_RVC_LITE_SUCCESS) { return err; } // 此时对于effect做fade_out,default做fade_in if (m_sync_state == CRVC_V2_STATE_EFFECT2DEFAULT) { // 此时由于m_rvc_inst本身存在延迟输出的情况[虽然头部的静音帧已经被砍掉了],但是其输入的数据和输出的数据并不是完美对应的,存在延迟差 // 所以此时输入的头部和输出的头部之前存在延迟差,但是不加音效是没有这个延迟差的 // 所以需要将输入的头部对应到其应该对应的输出真实数据的头部 // 比如: 输入: 1,2,3,4,5 输出: l1,l2,1,2,3 ,其中l1和l2是延迟采样点,也就是1,2,对应的是输出+延迟采样点才对 for(int i = 0; i < m_fade_len; i+=m_channel) { float rate = i * 1.0 / m_fade_len; for(int j = 0; j < m_channel; j+=1) { m_tmp_in_buf.get()[i+j] = m_tmp_in_buf.get()[i+j] * rate + m_tmp_out_buf.get()[i+j+m_input_latency_output_frame] * (1 - rate); } } { std::unique_lock lock(m_rvc_mutex); // 将之前要输入的那块塞进去 m_out_queue->push(m_tmp_out_buf.get(), m_input_latency_output_frame); m_out_queue->push(m_tmp_in_buf.get(), cur_in_len); } m_sync_state = CRVC_V2_STATE_DEFAULT; m_input_latency_output_frame = 0; while(m_in_queue->size() > 0) { cur_in_len = m_block_len; m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len); { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(m_tmp_in_buf.get(), cur_in_len); } } return ERR_RVC_LITE_SUCCESS; } // 此时对effect做fade_in,default做fade_out if (m_sync_state == CRVC_V2_STATE_DEFAULT2EFFECT) { for(int i = 0; i < m_fade_len; i+=m_channel) { float rate = i * 1.0 / m_fade_len; for(int j = 0; j < m_channel; j+=1) { m_tmp_out_buf.get()[i+j] = m_tmp_out_buf.get()[i+j] * rate + m_tmp_in_buf.get()[i+j] * (1 - rate); } } // 设置状态 m_sync_state = CRVC_V2_STATE_EFFECT; } // effect会存在输入和输出长度不一致的情况 m_input_latency_output_frame += cur_in_len - cur_out_len; // 加锁塞入数据 { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(m_tmp_out_buf.get(), cur_out_len); } } return ERR_RVC_LITE_SUCCESS; } int CRvcLiteOnlineV2::size() { return m_out_queue->size(); } void CRvcLiteOnlineV2::pop(float *buf, int &len) { std::unique_lock lock(m_rvc_mutex); m_out_queue->pop(buf, len); } diff --git a/mnn_demo/src/CRvcLiteSynthesizer.cpp b/mnn_demo/src/CRvcLiteSynthesizer.cpp index 6ff952b..711a6ed 100644 --- a/mnn_demo/src/CRvcLiteSynthesizer.cpp +++ b/mnn_demo/src/CRvcLiteSynthesizer.cpp @@ -1,128 +1,127 @@ // // Created by Administrator on 2024/1/21. // #include "CRvcLiteSynthesizer.h" #include #include CRvcLiteSynthesizer::CRvcLiteSynthesizer(){} CRvcLiteSynthesizer::~CRvcLiteSynthesizer() {} int CRvcLiteSynthesizer::init(const char *hubert_model, int sample_rate, int channel) { m_rvc_inst = std::make_shared(); int err = m_rvc_inst->init(hubert_model); if (err != ERR_RVC_LITE_SUCCESS) { return err; } m_resample2_16 = std::make_shared(); m_resample2_16->init(sample_rate, gs_src_samplerate, channel, 1); m_resample2src = std::make_shared(); m_resample2src->init(gs_dst_samplerate, sample_rate, 1, channel); m_channel = channel; m_sample_rate = sample_rate; m_buf_tmp_16k_len = 0; m_buf_tmp_16k_cap = 0; m_buf_tmp_32k_len = 0; m_buf_tmp_32k_cap = 0; m_buf_tmp_src_len = 0; m_buf_tmp_src_cap = 0; m_first = true; return ERR_RVC_LITE_SUCCESS; } int CRvcLiteSynthesizer::switch_model(const char *synth_model) { return m_rvc_inst->switch_synth_model(synth_model); } void CRvcLiteSynthesizer::set_up_key(int key) { m_rvc_inst->set_up_key(key); } void CRvcLiteSynthesizer::reset() { m_rvc_inst->reset(); m_first = true; } int CRvcLiteSynthesizer::process(float *in_buf, int in_len, float *out_buf, int &out_len) { // 1 重采样 2 推理 3 再次重采样 int resample_out_len = m_resample2_16->get_out_samples(in_len / m_channel); // 控制逻辑,不能超过该长度 if (resample_out_len > gs_src_samplerate) { return ERR_RVC_LITE_BLOCK_TOO_LONG; } - if (m_buf_tmp_16k_cap < resample_out_len) { m_buf_tmp_16k_cap = resample_out_len; m_buf_tmp_16k = std::shared_ptr(new float[m_buf_tmp_16k_cap], std::default_delete()); } m_buf_tmp_16k_len = resample_out_len; int err = m_resample2_16->resample(in_buf, in_len / m_channel, m_buf_tmp_16k.get(), m_buf_tmp_16k_len); if (err != ERR_RVC_LITE_SUCCESS) { return err; } if (m_buf_tmp_32k_cap < m_buf_tmp_16k_len * 2) { m_buf_tmp_32k_cap = m_buf_tmp_16k_len * 2; m_buf_tmp_32k = std::shared_ptr(new float[m_buf_tmp_32k_cap], std::default_delete()); } m_buf_tmp_32k_len = m_buf_tmp_16k_len * 2; // 推理 err = m_rvc_inst->process_block(m_buf_tmp_16k.get(), m_buf_tmp_16k_len, m_buf_tmp_32k.get(), m_buf_tmp_32k_len); if (err != ERR_RVC_LITE_SUCCESS) { return err; } // 重采样回来 int out_frame = m_resample2src->get_out_samples(m_buf_tmp_32k_len); if (m_buf_tmp_src_cap < out_frame * m_channel) { m_buf_tmp_src_cap = out_frame * m_channel; m_buf_tmp_src = std::shared_ptr(new float[m_buf_tmp_src_cap], std::default_delete()); } m_buf_tmp_src_len = out_frame; err = m_resample2src->resample(m_buf_tmp_32k.get(), m_buf_tmp_32k_len, m_buf_tmp_src.get(), m_buf_tmp_src_len); if (err != ERR_RVC_LITE_SUCCESS) { return err; } // 取较小的值 if (out_len > m_buf_tmp_src_len * m_channel) { out_len = m_buf_tmp_src_len * m_channel; } // 第一次过来,将头部的延迟块切掉 int latency_frame = 0; if (m_first) { m_first = false; latency_frame = int(m_rvc_inst->get_latency_ms() * 1.0 / 1000 * m_sample_rate) * m_channel; out_len -= latency_frame; } memcpy(out_buf, m_buf_tmp_src.get()+latency_frame, sizeof(float) * out_len); return ERR_RVC_LITE_SUCCESS; } float CRvcLiteSynthesizer::get_rtf() { struct timeval start; struct timeval end; gettimeofday(&start, NULL); int in_len = m_sample_rate * m_channel - 100 *m_channel; int out_len = in_len; float* in_buf = new float[in_len]; process(in_buf, in_len, in_buf, in_len); delete [] in_buf; gettimeofday(&end, NULL); double sp = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0; return sp / 1000; } \ No newline at end of file diff --git a/mnn_demo/src/CSynthesizer.cpp b/mnn_demo/src/CSynthesizer.cpp index 2346fe9..427d6c8 100644 --- a/mnn_demo/src/CSynthesizer.cpp +++ b/mnn_demo/src/CSynthesizer.cpp @@ -1,73 +1,86 @@ // // Created by ZIHAO GUO on 2023/11/16. // #include "CSynthesizer.h" #include #include CSynthesizer::CSynthesizer() = default; CSynthesizer::~CSynthesizer() { uninit(); } int CSynthesizer::init(const char *model_path) { m_config.type = MNN_FORWARD_CPU; m_runtime_info = MNN::Interpreter::createRuntime({m_config}); m_net = std::shared_ptr(MNN::Interpreter::createFromFile(model_path)); m_session = m_net->createSession(m_config, m_runtime_info); m_input_tensor = m_net->getSessionInput(m_session, nullptr); return 0; } float CSynthesizer::process(std::vector>> &contentvec_input, std::vector>> &ret) { std::vector input_dims{1, 205, 258}; auto input_tensor = MNN::Tensor::create(input_dims, nullptr, MNN::Tensor::CAFFE); auto input_data = input_tensor->host(); auto input_size = input_tensor->size(); // ::memcpy(input_data, contentvec_input.data(), input_size); for (int i = 0; i < 205; i++) { std::memcpy(input_data+i*258, contentvec_input[0][i].data(), input_size / 205); } m_input_tensor->copyFromHostTensor(input_tensor); delete input_tensor; struct timeval start; struct timeval end; gettimeofday(&start, NULL); m_net->runSession(m_session); gettimeofday(&end, NULL); float time = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0; auto output_tensor = m_net->getSessionOutput(m_session, nullptr); std::vector shape = output_tensor->shape(); auto output = MNN::Tensor::create(shape, nullptr, MNN::Tensor::CAFFE); auto output_data = output->host(); auto output_size = output->size(); output_tensor->copyToHostTensor(output); + for (int i = 0; i < shape[0]; i++) { + if (shape[0] > ret.size()) + { + ret.resize(shape[0]); + } for (int j = 0; j < shape[1]; j++) { + if (shape[1] > ret[j].size()) + { + ret[j].resize(shape[1]); + } for (int k = 0; k < shape[2]; k++) { + if (shape[2] > ret[i][j].size()) + { + ret[i][j].resize(shape[2]); + } ret[i][j][k] = *(output_data + i * 35840 + k); } } } return time; } void CSynthesizer::uninit() { if (m_net != nullptr) { m_net->releaseModel(); } m_net = nullptr; m_session = nullptr; m_input_tensor = nullptr; } diff --git a/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp b/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp index a0c762d..604ab5a 100644 --- a/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp +++ b/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp @@ -1,163 +1,162 @@ /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ /* pYIN - A fundamental frequency estimator for monophonic audio Centre for Digital Music, Queen Mary, University of London. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. See the file COPYING included with this distribution for more information. */ #include "ESPYIN.h" #include "ESMonoPitch.h" #include #include #include #include #include using std::string; using std::vector; ESPYIN::ESPYIN(float inputSampleRate, size_t stepSize, size_t blockSize, size_t fmin, size_t fmax) : m_stepSize(stepSize), m_blockSize(blockSize), m_fmin(fmin), m_fmax(fmax), m_yin(blockSize, inputSampleRate, 0.0), m_oF0Candidates(0), m_oF0Probs(1), m_oVoicedProb(2), m_oCandidateSalience(3), m_oSmoothedPitchTrack(4), m_threshDistr(2.0f), m_outputUnvoiced(2.0f), m_pitchProb(0) { reset(); } ESPYIN::~ESPYIN() { } void ESPYIN::reset() { m_yin.setThresholdDistr(m_threshDistr); m_yin.setFrameSize(m_blockSize); m_pitchProb.clear(); } void ESPYIN::updata(int reserve_frame_num) { vector > > temp_pitchProb(m_pitchProb); if (!temp_pitchProb.empty()) { int frame_num = int(temp_pitchProb.size()); if (reserve_frame_num <= 0 || reserve_frame_num > frame_num) { return; } for (int i = 0; i < reserve_frame_num; ++i) { temp_pitchProb[i] = temp_pitchProb[frame_num - reserve_frame_num + i]; } temp_pitchProb.resize(reserve_frame_num); m_pitchProb = temp_pitchProb; } } -ESFeatureSet -ESPYIN::process(const float * const inputBuffers) +ESFeatureSet ESPYIN::process(const float * const inputBuffers) { ESFeatureSet fs; double *dInputBuffers = new double[m_blockSize]; for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[i]; ESYin::YinOutput yo = m_yin.processProbabilisticYin(dInputBuffers); ESFeature f; for (size_t i = 0; i < yo.freqProb.size(); ++i) { f.values.push_back(yo.freqProb[i].first); } fs[m_oF0Candidates].push_back(f); f.values.clear(); float voicedProb = 0; for (size_t i = 0; i < yo.freqProb.size(); ++i) { f.values.push_back(yo.freqProb[i].second); voicedProb += yo.freqProb[i].second; } fs[m_oF0Probs].push_back(f); f.values.clear(); f.values.push_back(voicedProb); fs[m_oVoicedProb].push_back(f); f.values.clear(); float salienceSum = 0; for (size_t iBin = 0; iBin < yo.salience.size(); ++iBin) { f.values.push_back(yo.salience[iBin]); salienceSum += yo.salience[iBin]; } fs[m_oCandidateSalience].push_back(f); delete [] dInputBuffers; vector > tempPitchProb; for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate) { double tempPitch = 12 * std::log(yo.freqProb[iCandidate].first/440)/std::log(2.) + 69; tempPitchProb.push_back(pair (tempPitch, yo.freqProb[iCandidate].second)); } m_pitchProb.push_back(tempPitchProb); return fs; } ESFeatureSet ESPYIN::getRemainingFeatures(int reso_type) { ESFeatureSet fs; ESFeature f; vector > > temp_pitchProb(m_pitchProb); if (temp_pitchProb.empty()) { return fs; } // MONO-PITCH STUFF ESMonoPitch mp(reso_type); // std::cerr << "before viterbi" << std::endl; vector mpOut = mp.process(temp_pitchProb); // std::cerr << "after viterbi " << mpOut.size() << " "<< m_timestamp.size() << std::endl; for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) { if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue; f.values.clear(); if (m_outputUnvoiced == 1) { f.values.push_back(abs(mpOut[iFrame])); } else { f.values.push_back(mpOut[iFrame]); } fs[m_oSmoothedPitchTrack].push_back(f); } return fs; } int ESPYIN::getFrames() { return int(m_pitchProb.size()); }