diff --git a/mnn_demo/inc/CRvcLiteOnline.h b/mnn_demo/inc/CRvcLiteOnline.h
index ad62799..b4100aa 100644
--- a/mnn_demo/inc/CRvcLiteOnline.h
+++ b/mnn_demo/inc/CRvcLiteOnline.h
@@ -1,309 +1,318 @@
 //
 // Created by jianli.yang on 2023/11/29.
 //

 #ifndef MNN_DEMO_CRVCLITEONLINE_H
 #define MNN_DEMO_CRVCLITEONLINE_H
 #define DEBUG
 #ifdef __ANDROID__
 // NOTE(review): the header name was lost in extraction; <android/log.h> is
 // inferred from the use of __android_log_print below - confirm.
 #include <android/log.h>
 #ifdef STRELEASE
 #define LOGD(...)
 #define LOGE(...)
 #else
 #define LOGD(TAG, ...) __android_log_print(ANDROID_LOG_DEBUG , TAG, __VA_ARGS__)
 #define LOGE(TAG, ...) __android_log_print(ANDROID_LOG_ERROR , TAG, __VA_ARGS__)
 #endif
 #else
 #ifdef DEBUG
 // Wrapped in do { } while (0) so the two printf calls behave as ONE statement:
 // the previous "printf(..);printf(..);" form ran the second printf
 // unconditionally when the macro was used under an unbraced if/else.
 #define LOGD(TAG, ...) do { printf("\nDebug: %s",TAG); printf(__VA_ARGS__); } while (0)
 #define LOGE(TAG, ...) do { printf("\nError: %s",TAG); printf(__VA_ARGS__); } while (0)
 #else
 #define LOGD(TAG, ...)
 #define LOGE(TAG, ...)
 #endif
 #endif

 // NOTE(review): the five header names below were lost in extraction; they are
 // inferred from the std:: types used in this file - confirm against the repo.
 #include <condition_variable>
 #include <memory>
 #include <mutex>
 #include <string>
 #include <vector>
+#include <cstdio>  // printf used by the desktop LOGD/LOGE macros

 #define gs_src_samplerate 16000
 #define gs_dst_samplerate 32000
 #define gs_crossfade_time 0.08 // in seconds
 #define gs_block_time 1
 #define gs_extra_time 1
 #define gs_hubert_frame 206 // model-dependent
 #define gs_hubert_dim 256 // model-dependent
 #define gs_synth_input_frame 205 // model-dependent
 #define gs_synth_input_dim 258 // model-dependent
 #define gs_synth_output_frame 35840 // model-dependent

 enum {
     ERR_RVC_LITE_SUCCESS = 0,
     ERR_RVC_LITE_NOT_INIT = 1,
     ERR_RVC_LITE_REINIT = 2,
     ERR_RVC_LITE_RT_REINIT = 3,
     ERR_RVC_LITE_RT_NOT_INIT = 4,
     ERR_RVC_LITE_RT_NOT_ENOUGH_DATA = 5,
     ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR = 6, // sample rate below 16000
     ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT = 7, // resample output buffer too short
-    ERR_RVC_LITE_NOT_SWITCH_MODEL = 8, // resample output buffer too short
+    ERR_RVC_LITE_NOT_SWITCH_MODEL = 8, // no timbre model has been selected
     ERR_RVC_LITE_MODEL_NOT_EXISTS = 9, // vocal model file does not exist
     ERR_RVC_LITE_BLOCK_TOO_LONG = 10, // block too large
 };

 const int RVC_LITE_RT_SYN_STATE_DEFAULT = 0;
 const int RVC_LITE_RT_SYN_STATE_EFFECT = 1;
 const int RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT = 2;
 const int RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT = 3;
 const int RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT = 4;

 class Hubert;
 class CSynthesizer;
 class ESPYIN;
 class CThreadPool;
 class CRvcCircleBuffer;
 class CFfmpegResampler;

 /**
  * Lightweight real-time RVC inference.
  * Expects 16 kHz input audio; output is at the target sample rate.
  */
 class CRvcLiteOnline
 {
 public:
     CRvcLiteOnline();
     ~CRvcLiteOnline();
+    // The class owns raw float buffers that the destructor releases; an
+    // implicit copy would share them and free them twice.
+    CRvcLiteOnline(const CRvcLiteOnline &) = delete;
+    CRvcLiteOnline &operator=(const CRvcLiteOnline &) = delete;

 private:
     void uninit();
     void get_f0_post();
     void get_pyin_f0();
     void init_variable();

 public:
     /**
      * Initialise the instance.
      * @param hubert_model_path
      * @return 0 on success
      */
     int init(const char *hubert_model_path);

     /**
      * Switch the timbre (synthesizer) model.
      * @param synth_model_path
      * @return
      */
     int switch_synth_model(const char* synth_model_path);

     /**
+     * Set the key shift, range [-12, 12].
+     * @param key
+     */
+    void set_up_key(int key);
+
+    /**
      * Process one fixed-length block.
      * Input must be mono 16 kHz audio.
      * @param in_buf
      * @param in_len  at most gs_src_samplerate; gs_src_samplerate is best
      * @param out_buf
      * @param out_len at most gs_dst_samplerate; gs_dst_samplerate is best
      *                [depends on the input: at 32k it is exactly twice the input]
      * @return 0 on success
      */
     int process_block(float *in_buf, int in_len, float *out_buf, int out_len);

     /**
      * Clear the internal buffers.
      * @return
      */
     void reset();

     /**
      * Get the latency.
      * @return
      */
     int get_latency_ms();

 private:
     // Whether init() has been called.
     bool m_init;
     bool m_switch_model;
     // NOTE(review): template arguments in this file were lost in extraction;
     // they are restored from the forward declarations and usage - confirm.
     std::shared_ptr<Hubert> m_hubert_inst;
     std::shared_ptr<CSynthesizer> m_synthesizer_inst;
     std::shared_ptr<ESPYIN> m_es_pyin;

     // Cached data.
     // Required input block length, in samples.
     int m_input_block_frame;
     // Extra length needed for inference.
     int m_input_extra_frame;
     // Length of the inference buffer.
     int m_input_predict_buf_frame;
     // The inference buffer.
     float *m_input_predict_buf;
     std::vector<float> m_f0_data;
     std::vector<float> m_f0_coarse_data;

     // Output side.
     int m_crossfade_frame;
     int m_output_block_frame;
     int m_output_cache_buf_frame;
     float *m_crossfade_buf;
     float *m_output_cache_buf;
     bool m_fade_in;
+
+    float m_f0_new_up_key;
+    float m_f0_up_key;

     // Results returned by each model instance.
     std::vector<std::vector<std::vector<float>>> m_hubert_ret;
     std::vector<std::vector<std::vector<float>>> m_synth_input;
     std::vector<std::vector<std::vector<float>>> m_synth_out;
 };

 class CResample
 {
 public:
     CResample();
     ~CResample();

 public:
     int init(int in_samplerate, int out_samplerate, int in_channel=1, int out_channel=1);
     // Returns the number of samples per channel.
     int get_out_samples(int num);
     int get_latency();
     void reset();
     // Internal buffering is not considered: takes whatever is available.
     // in_num and out_num are both per-channel sample counts.
     int resample(float * in_buf, int in_num, float * out_buf, int & out_num);

 private:
     std::shared_ptr<CFfmpegResampler> m_resample_inst;
     int m_in_channel;
     int m_out_channel;
 };

 /**
  * Real-time processing class.
  * One frame in, one frame out; very short input frames are allowed.
  * Latency is high, around 2 s.
  * Design:
  * 1. The constructor sets the variables.
  * 2. init() sets up the environment and starts the processing thread.
  * 3. process() is fed one frame per call and triggers one decision step.
  * 4. flush() processes all pending input once and returns it together with
  *    the data that has not been fetched yet.
  * 5. The destructor stops the processing thread and releases all memory.
  */
 class CRvcLiteOnlineRealTime
 {
 public:
     CRvcLiteOnlineRealTime();
     ~CRvcLiteOnlineRealTime();

 private:
     void init_variable();
     void rvc_process();
     void rvc_process_step();
     void uninit();
     void stop();

 public:
     /**
      * Initialise the instance.
      * @param hubert_model_path
      * @param sample_rate
      * @param channel
      * @return
      */
     int init(const char *hubert_model_path, int sample_rate, int channel);

     /**
      * Switch the timbre.
      * @param synth_model_path
      * @return
      */
     int switch_synth(const char *synth_model_path);

     /**
      * Clear the buffers.
      */
     void reset();

     /**
      * One frame in, one frame out; both lengths must match.
      * in_buf and out_buf may be the same buffer.
      * @param in_buf
      * @param in_len
      * @param out_buf
      * @param out_len
      * @return
      */
     int process(float *in_buf, int in_len, float *out_buf, int out_len);

     /**
      * Fetch every processed result.
      * The amount is not known in advance, so the buffer is allocated
      * internally and must be released by the caller.
      * @return
      */
     void flush(float *&out_buf, int &len);

     /**
      * Get the latency.
      */
     int get_latency_ms();

 private:
     int m_sample_rate;
     int m_channel;
     std::shared_ptr<CRvcCircleBuffer> m_resample_queue;
     std::shared_ptr<CRvcCircleBuffer> m_input_queue;
     std::shared_ptr<CRvcCircleBuffer> m_out_queue;
     int m_input_tmp_buf_len;
     int m_output_tmp_buf_len;
     std::shared_ptr<float> m_input_tmp_buf;
     std::shared_ptr<float> m_output_tmp_buf;
     std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
     std::shared_ptr<CThreadPool> m_thread_pool;

     // State flags.
     bool m_init;

     // Processing thread.
     bool m_rvc_stop;
     std::mutex m_rvc_mutex;
     std::condition_variable m_rvc_cond;

     // Resampling.
     std::shared_ptr<CResample> m_resample16;
     std::shared_ptr<CResample> m_resample2src;
     int m_resample_buf_max_len;
     std::shared_ptr<float> m_resample_in_buf;
     std::shared_ptr<float> m_resample_out_buf;

     // Timbre switching.
     std::string m_synth_path;
     std::string m_new_synth_path;

     // Synthesis state.
     int m_syn_state;

     // Delay line.
     std::shared_ptr<CRvcCircleBuffer> m_latency_queue;
 };

 #endif //MNN_DEMO_CRVCLITEONLINE_H
diff --git a/mnn_demo/inc/CRvcLiteOnlineV2.h b/mnn_demo/inc/CRvcLiteOnlineV2.h
new file mode 100644
index 0000000..5abeb15
--- /dev/null
+++ b/mnn_demo/inc/CRvcLiteOnlineV2.h
@@
-0,0 +1,105 @@
+//
+// Created by Administrator on 2024/1/22.
+//
+
+#ifndef MNN_DEMO_CRVCLITEONLINEV2_H
+#define MNN_DEMO_CRVCLITEONLINEV2_H
+#include "CRvcLiteSynthesizer.h"
+
+const int CRVC_V2_STATE_DEFAULT = 0;
+const int CRVC_V2_STATE_EFFECT = 1;
+const int CRVC_V2_STATE_DEFAULT2EFFECT = 2;
+const int CRVC_V2_STATE_EFFECT2DEFAULT = 3;
+
+/**
+ * Usage:
+ * after init(), push / pop / switch_model may simply be called asynchronously.
+ * See main.cpp for a complete example.
+ */
+class CRvcLiteOnlineV2
+{
+public:
+    CRvcLiteOnlineV2();
+    ~CRvcLiteOnlineV2();
+
+private:
+    void set_cur_state(bool reset);
+
+public:
+    /**
+     * Initialise with the Hubert model, sample rate and channel count.
+     * @param hubert_model
+     * @param sample_rate
+     * @param channel
+     * @return 0 on success
+     */
+    int init(const char* hubert_model, int sample_rate, int channel);
+
+    /**
+     * Set the vocal model path; returns an error code if the model file
+     * does not exist.
+     * @param synth_model
+     * @return 0 on success
+     */
+    int switch_model(const char* synth_model);
+
+    /**
+     * Set the pitch shift, range [-12, 12].
+     * Only takes effect while a vocal model is active.
+     * The setting survives a model switch, and a value set while no model
+     * is active is applied once a model is selected.
+     * @param key
+     */
+    void set_up_key(int key);
+
+    /**
+     * Clear the buffered data.
+     */
+    void reset();
+
+    /**
+     * Feed vocal data (blocking).
+     * @param buf  input samples [to save memory this buffer is modified in
+     *             place; the first frame after reset() is faded in]
+     * @param len  buffer length, i.e. sample * channel
+     * @param last true for the final frame: inference is always run so the
+     *             result can be fetched
+     * @return 0 on success
+     */
+    int push(float* buf, int len, bool last=false);
+
+    /**
+     * Return the amount of data currently available internally,
+     * as frame * channel.
+     * @return
+     */
+    int size();
+
+    /**
+     * Fetch processed output.
+     * @param buf destination buffer
+     * @param len capacity of buf (frame * channel). On return it holds the
+     *            number of values written: fewer than requested when less
+     *            data is available, never more than requested.
+     */
+    void pop(float* buf, int& len);
+
+public:
+    // NOTE(review): template arguments were lost in extraction; restored from
+    // the included header and the member names - confirm against the repo.
+    // Processing logic.
+    std::shared_ptr<CRvcLiteSynthesizer> m_rvc_inst;
+    // Input queue.
+    std::shared_ptr<CRvcCircleBuffer> m_in_queue;
+    // Output queue.
+    std::shared_ptr<CRvcCircleBuffer> m_out_queue;
+    std::shared_ptr<float> m_tmp_in_buf;
+    std::shared_ptr<float> m_tmp_out_buf;
+    int m_tmp_buf_len;
+    int m_block_len;
+    bool m_reset;
+    std::string m_syn_model;
+    std::string m_new_syn_model;
+    int m_sync_state;
+    int m_fade_len;
+    int m_channel;
+    // Gap between input and output.
+    int m_input_latency_output_frame;
+    std::mutex m_rvc_mutex;
+};
+
+
+#endif //MNN_DEMO_CRVCLITEONLINEV2_H
diff --git a/mnn_demo/inc/CRvcLiteSynthesizer.h b/mnn_demo/inc/CRvcLiteSynthesizer.h
index bac8c21..cedcb2e 100644
--- a/mnn_demo/inc/CRvcLiteSynthesizer.h
+++ b/mnn_demo/inc/CRvcLiteSynthesizer.h
@@ -1,58 +1,82 @@
 //
 // Created by Administrator on 2024/1/21.
 //

 #ifndef MNN_DEMO_CRVCLITESYNTHESIZER_H
 #define MNN_DEMO_CRVCLITESYNTHESIZER_H

 #include "CRvcLiteOnline.h"

 class CRvcLiteSynthesizer
 {
 public:
     CRvcLiteSynthesizer();
     ~CRvcLiteSynthesizer();

 public:
     /**
      * Initialise.
      * @param hubert_model path of the semantic (Hubert) model
-     * @param synth_model path of the timbre model
      * @param sample_rate sample rate
      * @param channel channel count
      * @return 0 on success
      */
-    int init(const char* hubert_model, const char* synth_model, int sample_rate, int channel);
+    int init(const char* hubert_model, int sample_rate, int channel);
+
+    /**
+     * Select the vocal model.
+     * @param synth_model path of the timbre model
+     * @return
+     */
+    int switch_model(const char* synth_model);
+
+    /**
+     * Set the pitch shift, range [-12, 12].
+     * Only takes effect while a vocal model is active.
+     * The setting survives a model switch, and a value set while no model
+     * is active is applied once a model is selected.
+     * @param key
+     */
+    void set_up_key(int key);
+
+    /**
+     * Reset: clear the internal data.
+     */
+    void reset();

     /**
      * Processing entry point.
      * @param in_buf  input buffer
      * @param in_len  input length, frame*channel; at most 1 s of audio is
      *                recommended, as large as possible within that limit
      * @param out_buf output buffer
      * @param out_len output buffer length, frame*channel
      * Note: the produced length is not necessarily equal to in_len. It is at
      * most out_len and the output is continuous, so out_len may be chosen
      * somewhat larger than in_len to make sure everything can be returned.
      * @return
      */
     int process(float* in_buf, int in_len, float* out_buf, int &out_len);

-    // Real-time factor: wall-clock time to process 1 s of audio / 1 s
+    /**
+     * Real-time factor: wall-clock time to process 1 s of audio / 1 s.
+     * @return
+     */
     float get_rtf();

 private:
     // NOTE(review): template arguments were lost in extraction; restored from
     // the included header and the member names - confirm against the repo.
     std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
     std::shared_ptr<CResample> m_resample2_16;
     std::shared_ptr<CResample> m_resample2src;
     int m_channel;
     int m_sample_rate;

     std::shared_ptr<float> m_buf_tmp_16k;
     int m_buf_tmp_16k_len;
     int m_buf_tmp_16k_cap;

     std::shared_ptr<float> m_buf_tmp_32k;
     int m_buf_tmp_32k_len;
     int m_buf_tmp_32k_cap;
     std::shared_ptr<float> m_buf_tmp_src;
     int m_buf_tmp_src_len;
     int m_buf_tmp_src_cap;
+    bool m_first;
 };

 #endif
//MNN_DEMO_CRVCLITESYNTHESIZER_H
diff --git a/mnn_demo/main.cpp b/mnn_demo/main.cpp
index 8aa637d..0d6b685 100644
--- a/mnn_demo/main.cpp
+++ b/mnn_demo/main.cpp
@@ -1,221 +1,285 @@
 // NOTE(review): the three header names below were lost in extraction; they are
 // inferred from printf / gettimeofday / std::this_thread usage - confirm.
 #include <cstdio>
 #include <sys/time.h>
 #include <thread>

 #include "src/Hubert.h"
 #include "src/CSynthesizer.h"
 #include "CRvcLiteSynthesizer.h"
+#include "CRvcLiteOnlineV2.h"

 // Smoke test: run the Hubert model once on a constant input.
 int test_hubert() {
     const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v1_fp16.mnn";
     Hubert hubert;
     int err_code = hubert.init(hubert_model_path);
     std::vector<float> input(33280, 0.1);
     std::vector<std::vector<std::vector<float>>> ret;
     ret.resize(1);
     ret[0].resize(205);
     for (int i = 0; i < 205; i++) {
         ret[0][i].resize(256);
     }
     float time = hubert.process(input.data(), ret);
     return 0;
 }

 // Smoke test: time the synthesizer model over repeated runs.
 int test_contentvec() {
     const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
     CSynthesizer contentVec;
     int err_code = contentVec.init(contentvec_model_path);

     std::vector<std::vector<std::vector<float>>> input(1);
     input[0].resize(205);
     for (int i = 0; i < 205; i++) {
         for (int j = 0; j < 258; j++) {
             if (j == 256) {
                 input[0][i].push_back(0.2);
             } else if (j == 257) {
                 input[0][i].push_back(1.0);
             } else {
                 input[0][i].push_back(0.1);
             }
         }
     }

     std::vector<std::vector<std::vector<float>>> ret;
     ret.resize(1);
     for (int i = 0; i < 1; i++) {
         ret[i].resize(1);
         ret[i][0].resize(35840);
     }

+    // Single source of truth for the run count, so the average below cannot
+    // drift from the loop bound (it used to divide 10 runs by 100).
+    const int run_count = 10;
     float tot = 0.f;
     for (int i = 0; i < run_count; i++) {
         float time = contentVec.process(input, ret);
         tot += time;
     }
     printf("time: %f \n", tot / run_count);
     return 0;
 }

 #include "CRvcLiteOnline.h"
 #include "av_waves/waves/inc/STWaveFile.h"

 // Offline test of CRvcLiteOnline::process_block on a 16 kHz mono file.
 void test() {
     const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
     const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
     const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_16.wav";
 //    const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
     const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v1.wav";
     CRvcLiteOnline rvc_inst;
     rvc_inst.init(hubert_model_path);

     // Read the audio file; must be 16 kHz mono.
     STCWaveFile wav_inst(in_wav, false);
     int sample_rate = wav_inst.GetSampleRate();
     int channel = wav_inst.GetChannels();
     int len = wav_inst.GetTotalFrames() * channel;
     float *data = new float[len];
     // Value-initialised so blocks that fail to process are written as silence.
     float *outdata = new float[len * 2]();
     wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
     int step = sample_rate;
     printf("start ..\n");
     for (int i = 0; i < len; i += step) {
         if (i + step > len) {
             step = len - i;
         }
         struct timeval start;
         struct timeval end;
         gettimeofday(&start, NULL);
         rvc_inst.process_block(data + i, step, outdata + 2 * i, 2 * step);
         gettimeofday(&end, NULL);
         printf("sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
     }
     STCWaveFile wav_out_inst(out_wav, true);
     wav_out_inst.SetSampleRate(32000);
     wav_out_inst.SetChannels(1);
     wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
     wav_out_inst.SetupDone();
     wav_out_inst.WriteFrame(outdata, len * 2);
+    // Both buffers used to leak.
+    delete[] data;
+    delete[] outdata;
     printf("finish2 ....\n");
 }

 // Offline test of CRvcLiteSynthesizer::process at the file's own rate.
 void test_rvc_lite_synth()
 {
     const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
     const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
     const char *out_wav = "/mnt/d/dataset/tmp/i_out3.wav";
     const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";

     STCWaveFile wav_inst(in_wav, false);
     int sample_rate = wav_inst.GetSampleRate();
     int channel = wav_inst.GetChannels();
     int len = wav_inst.GetTotalFrames() * channel;
     float *data = new float[len];
     // Value-initialised: the writer below always emits the full length.
     float *outdata = new float[len]();
     wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
     CRvcLiteSynthesizer m_rvc_inst;
-    int err = m_rvc_inst.init(hubert_model_path, syz_model, sample_rate, channel);
+    int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel);
     printf("init err=%d!\n", err);
     printf("rtf=%f\n", m_rvc_inst.get_rtf());
     int step = sample_rate * channel - 100 * channel;
     int out_len = 0;
     for(int i = 0; i < len; i+=step)
     {
         if (i + step > len) {
             step = len - i;
         }
         int out_step = step;
         err = m_rvc_inst.process(data+i, step, outdata+out_len, out_step);
         if(err != ERR_RVC_LITE_SUCCESS)
         {
-            printf("process err!\n");
+            printf("process err=%d!\n", err);
+            // The early return used to leak both buffers.
+            delete[] data;
+            delete[] outdata;
             return ;
         }
         out_len += out_step;
     }
     STCWaveFile wav_out_inst(out_wav, true);
     wav_out_inst.SetSampleRate(sample_rate);
     wav_out_inst.SetChannels(channel);
     wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
     wav_out_inst.SetupDone();
     wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
     delete[] data;
     delete[] outdata;
 }

+// Offline test of CRvcLiteOnlineV2: push/pop with a model switch after one
+// third of the input and a reset after one half.
+void test_rvc_lite_v2()
+{
+    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
+    const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
+    const char *out_wav = "/mnt/d/dataset/tmp/i_out_01_r.wav";
+    const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
+
+    STCWaveFile wav_inst(in_wav, false);
+    int sample_rate = wav_inst.GetSampleRate();
+    int channel = wav_inst.GetChannels();
+    int len = wav_inst.GetTotalFrames() * channel;
+    float *data = new float[len];
+    // Value-initialised: the writer below always emits the full length, even
+    // when pop() delivered less than len values.
+    float *outdata = new float[len]();
+    wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
+    CRvcLiteOnlineV2 m_rvc_inst;
+    int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel);
+//    m_rvc_inst.switch_model(syz_model);
+//    m_rvc_inst.set_up_key(0);
+    printf("init err=%d!\n", err);
+    int step = sample_rate * channel - 100 * channel;
+    int out_len = 0;
+    bool last = false;
+    int flag = 0;
+    for(int i = 0; i < len; i+=step)
+    {
+        if (i + step > len) {
+            step = len - i;
+            last = true;
+        }
+        int out_step = step;
+        err = m_rvc_inst.push(data+i, step, last);
+        if(err != ERR_RVC_LITE_SUCCESS)
+        {
+            printf("process err=%d!\n", err);
+            // The early return used to leak both buffers.
+            delete[] data;
+            delete[] outdata;
+            return ;
+        }
+
+        if (i >= len / 3 && flag == 0)
+        {
+            flag = 1;
+            m_rvc_inst.switch_model(syz_model);
+        }
+
+        if (i >= len / 2 && flag == 1)
+        {
+            flag = 2;
+            m_rvc_inst.reset();
+        }
+
+        out_step = 2 * step;
+        // outdata holds only len values: never ask pop() for more than the
+        // space that is left, otherwise it writes past the end of the buffer.
+        if (out_step > len - out_len) {
+            out_step = len - out_len;
+        }
+        m_rvc_inst.pop(outdata+out_len, out_step);
+        out_len += out_step;
+    }
+    STCWaveFile wav_out_inst(out_wav, true);
+    wav_out_inst.SetSampleRate(sample_rate);
+    wav_out_inst.SetChannels(channel);
+    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
+    wav_out_inst.SetupDone();
+    wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
+    delete[] data;
+    delete[] outdata;
+}

 // Real-time style test: feed 1024-value frames with a reset after one third.
 void test_rvc_lite_online() {
 //    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
 //    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/layers6_checkpoint_14_1660000_1_hubert.mnn";
     const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
 //    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
 //    const char *syz_model = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xusong_v1_6hubert_hifix_syz_base_vctk_kd_32k_hubert6_jianli_e225_s62775_205.mnn";
     const char *xs_model = "/mnt/d/dataset/svc/models/layers_3/layer3_xusong.mnn";
     const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
 //    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xiafan_fp16.mnn";
 //    const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01.wav";
     const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
 //    const char* in_wav = "/mnt/d/dataset/svc/dataset/短数据样本/男声/qiankun.wav";
 //    const char* in_wav = "/mnt/d/dataset/tmp/i.wav";
 //    const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
 //    const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v4.wav";
 //    const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/qiankun_412_v4.wav";
     const char *out_wav = "/mnt/d/dataset/tmp/i_out2.wav";

     // Read the audio file; must be 16 kHz mono.
     STCWaveFile wav_inst(in_wav, false);
     int sample_rate = wav_inst.GetSampleRate();
     int channel = wav_inst.GetChannels();
     int len = wav_inst.GetTotalFrames() * channel;
     float *data = new float[len];
     float *outdata = new float[len];
     CRvcLiteOnlineRealTime rvc_inst;
     rvc_inst.init(hubert_model_path, sample_rate, channel);
     wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
     int step = 1024;
     printf("start ..\n");
     bool flag = true;
     rvc_inst.switch_synth(syz_model);
     for (int i = 0; i < len; i += step) {
         if (i + step > len) {
             step = len - i;
         }
         struct timeval start;
         struct timeval end;
         gettimeofday(&start, NULL);
         int ret = rvc_inst.process(data + i, step, outdata+i, step);
         std::this_thread::sleep_for(std::chrono::milliseconds (15));
         gettimeofday(&end, NULL);
         printf("ret = %d, sp = %f ms step=%d\n", ret,
                (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0, step);

         if (flag && i >= len / 3) {
             flag = false;
             rvc_inst.reset();
 //            rvc_inst.switch_synth(xs_model);
         }
     }
     STCWaveFile wav_out_inst(out_wav, true);
     wav_out_inst.SetSampleRate(sample_rate);
     wav_out_inst.SetChannels(channel);
     wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
     wav_out_inst.SetupDone();
     wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());

     // Initialised so a flush() that returns early cannot leave them garbage.
     float* flush_data = nullptr;
     int flush_len = 0;
     rvc_inst.flush(flush_data, flush_len);
     wav_out_inst.WriteFrame(flush_data, flush_len/channel);
+    // flush() allocates with new[] and hands ownership to the caller.
+    delete[] flush_data;
+    delete[] data;
+    delete[] outdata;
     printf("finish2 ....\n");
 }

 int main() {
 //    int ret_hubert = test_hubert();
 //    int ret_contentvec = test_contentvec();
 //    test();
 //    test();
 //    test_rvc_lite_online();
-    test_rvc_lite_synth();
+//    test_rvc_lite_synth();
+    test_rvc_lite_v2();
     return 0;
 }
diff --git a/mnn_demo/src/CRvcLiteOnline.cpp b/mnn_demo/src/CRvcLiteOnline.cpp
index 241c6b8..f9067f7 100644
--- a/mnn_demo/src/CRvcLiteOnline.cpp
+++ b/mnn_demo/src/CRvcLiteOnline.cpp
@@ -1,811 +1,831 @@
 //
 // Created by Administrator on 2023/11/29.
//

 // NOTE(review): the system header names in this file were lost in extraction;
 // they are inferred from memset/memmove, ceil/log2/pow, gettimeofday and
 // access() - confirm against the repository.
 #include <cstring>
 #include <cmath>
 #include <sys/time.h>
 #include "CRvcLiteOnline.h"
 #include "Hubert.h"
 #include "CSynthesizer.h"
 #include "espyin-v1.0/ESPYIN.h"
 #include "ThreadPool.h"
 #include "CRvcCircleBuffer.h"
 #include "FfmpegResampler.h"
 #include <unistd.h>

 // True when the path can be accessed (access(..., F_OK) succeeds).
 inline bool file_exists (const std::string& name) {
     return ( access( name.c_str(), F_OK ) != -1 );
 }

 // Average interleaved stereo pairs into mono.
 // size is the length of the input BUFFER (values, not frames).
 void stereo2mono(float *input, int size, float *output) {
     for (int i = 0; i < size - 1; i += 2) {
         output[i / 2] = (input[i] + input[i + 1]) / 2;
     }
 }

 // Duplicate each mono sample into an interleaved stereo pair;
 // output must hold 2 * size values.
 void mono2stereo(float *input, int size, float *output) {
     for (int i = 0; i < size; i++) {
         output[2 * i] = input[i];
         output[2 * i + 1] = input[i];
     }
 }

 // Allocates the input/output buffers and pre-sizes the model tensors.
 CRvcLiteOnline::CRvcLiteOnline() {
     init_variable();
     m_init = false;
     m_switch_model = false;
     // Input-side variables.
     // Required input block length, in samples.
     m_input_block_frame = int(gs_block_time * gs_src_samplerate);
     // Extra length needed for inference.
     m_input_extra_frame = int(gs_extra_time * gs_src_samplerate);
     int zc = gs_src_samplerate / 100; // samples per 10 ms
     int input_corssfade_frame = int(gs_crossfade_time * gs_src_samplerate);
     // Length of the inference buffer, rounded up to a multiple of 10 ms.
     m_input_predict_buf_frame = int(ceil((m_input_extra_frame + input_corssfade_frame + m_input_block_frame)
                                          * 1.0 / zc) * zc);
     // The inference buffer.
     m_input_predict_buf = new float[m_input_predict_buf_frame];
     memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);

     // Output-side variables.
     m_crossfade_frame = int(gs_crossfade_time * gs_dst_samplerate);
     m_output_block_frame = int(gs_block_time * gs_dst_samplerate);
     int output_extra_frame = int(gs_extra_time * gs_dst_samplerate);
     zc = gs_dst_samplerate / 100;
     m_output_cache_buf_frame = int(ceil((m_output_block_frame + m_crossfade_frame + output_extra_frame)
                                         * 1.0 / zc) * zc);
     m_output_cache_buf = new float[m_output_cache_buf_frame];
     memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
     m_crossfade_buf = new float[m_crossfade_frame];
     memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);

     // Cache the model inputs and outputs.
     // These sizes are hard-coded and tied to the models.
     m_hubert_ret.resize(1);
     m_hubert_ret[0].resize(gs_hubert_frame);
     for (int i = 0; i < gs_hubert_frame; i++) {
         m_hubert_ret[0][i].resize(gs_hubert_dim);
     }

     // Input of the synth model.
     m_synth_input.resize(1);
     m_synth_input[0].resize(gs_synth_input_frame);
     for (int i = 0; i < gs_synth_input_frame; i++) {
         m_synth_input[0][i].resize(gs_synth_input_dim);
     }

     m_synth_out.resize(1);
     m_synth_out[0].resize(1);
     m_synth_out[0][0].resize(gs_synth_output_frame);
 }

 CRvcLiteOnline::~CRvcLiteOnline() {
     uninit();
 }

 /********************************** internal functions *********************************************/
 // Release the three heap buffers and return every member to its default.
 void CRvcLiteOnline::uninit() {
     if (m_input_predict_buf != NULL) {
         delete[] m_input_predict_buf;
         m_input_predict_buf = NULL;
     }
     if (m_output_cache_buf != NULL) {
         delete[] m_output_cache_buf;
         m_output_cache_buf = NULL;
     }

     if (m_crossfade_buf != NULL) {
         delete[] m_crossfade_buf;
         m_crossfade_buf = NULL;
     }
     init_variable();
 }

 // Run pYIN over the whole inference buffer in 160-sample hops, store the
 // key-shifted f0 track in m_f0_data, then derive the coarse track.
 void CRvcLiteOnline::get_pyin_f0() {
     for (int i = 0; i < m_input_predict_buf_frame; i += 160) {
         m_es_pyin->process(m_input_predict_buf + i);
     }
     m_f0_data.clear();
     ESFeatureSet feats = m_es_pyin->getRemainingFeatures();
     if (!feats.empty()) {
         m_f0_data.resize(feats[4].size());
         for (size_t i = 0; i < feats[4].size(); ++i) {
-            // JL_DEBUG
-            m_f0_data[i] = feats[4][i].values[0];
+            // Apply the key shift.
+            m_f0_data[i] = feats[4][i].values[0] * m_f0_up_key;
             if (m_f0_data[i] < 0) {
                 m_f0_data[i] = 0;
             }
         }
     }
     m_es_pyin->reset();
     get_f0_post();
 }

 // Quantise m_f0_data into m_f0_coarse_data: values are mapped onto 1..255
 // over the 50-1100 Hz range; non-positive results clamp to 1.
 void CRvcLiteOnline::get_f0_post() {
     int f0_min = 50;
     int f0_max = 1100;
     float f0_mel_min = 1127 * log2(1 + f0_min * 1.0 / 700);
     float f0_mel_max = 1127 * log2(1 + f0_max * 1.0 / 700);
     m_f0_coarse_data.clear();
     m_f0_coarse_data.resize(m_f0_data.size());
     // size_t index: avoids the signed/unsigned comparison with size().
     for (size_t i = 0; i < m_f0_data.size(); i++) {
         float f0_mel = 1127 * log2(1 + m_f0_data[i] / 700);
         if (f0_mel > 0) {
             f0_mel = (f0_mel - f0_mel_min) * 254.f / (f0_mel_max - f0_mel_min) + 1;
         }
         if (f0_mel <= 1) {
             f0_mel = 1;
         } else if (f0_mel > 255) {
             f0_mel = 255;
         }
         m_f0_coarse_data[i] = float(int(f0_mel + 0.5));
     }
 }

 // Put every member into its default state. Does NOT free the buffers:
 // callers must release them first (see uninit()).
 void CRvcLiteOnline::init_variable() {
     m_init = false;
     m_switch_model = false;
     // Cached data.
     // Required input block length, in samples.
     m_input_block_frame = 0;
     m_input_extra_frame = 0;
     m_input_predict_buf_frame = 0;
     m_input_predict_buf = nullptr;
     m_f0_data.clear();
     m_f0_coarse_data.clear();

     m_crossfade_frame = 0;
     m_output_block_frame = 0;
     m_output_cache_buf_frame = 0;
     m_crossfade_buf = nullptr;
     m_output_cache_buf = nullptr;

     // Results returned by each model instance.
     m_hubert_ret.clear();
     m_synth_input.clear();
     m_synth_out.clear();
     m_fade_in = true;
+    m_f0_up_key = 1.f;
+    m_f0_new_up_key = 1.f;
 }

 /********************************** public functions *********************************************/

 // Create the Hubert, synthesizer and pYIN instances.
 // Returns ERR_RVC_LITE_REINIT when called twice.
 // NOTE(review): the result of m_hubert_inst->init() is ignored - confirm
 // whether a failed model load should be reported here.
 int CRvcLiteOnline::init(const char *hubert_model_path) {
     if (m_init) {
         return ERR_RVC_LITE_REINIT;
     }

     // NOTE(review): make_shared template arguments were lost in extraction;
     // restored from the member names and forward declarations - confirm.
     m_hubert_inst = std::make_shared<Hubert>();
     m_synthesizer_inst = std::make_shared<CSynthesizer>();
     m_hubert_inst->init(hubert_model_path);
 //    m_synthesizer_inst->init(synth_model_path);
     // stepSize must be a power of two.
     m_es_pyin = std::make_shared<ESPYIN>(16000, 160, 1024, 50, 1100);
     m_init = true;
     m_switch_model = false;
     m_fade_in = true;
+    m_f0_up_key = 1.f;
+    m_f0_new_up_key = 1.f;
     return ERR_RVC_LITE_SUCCESS;
 }

 // Load a new synthesizer model from synth_model_path.
 // Returns ERR_RVC_LITE_MODEL_NOT_EXISTS when the path is null or missing.
 int CRvcLiteOnline::switch_synth_model(const char *synth_model_path) {
     if (!m_init) {
         return ERR_RVC_LITE_NOT_INIT;
     }

+    // A null path would be undefined behaviour when converted to std::string.
+    if (synth_model_path == nullptr) {
+        return ERR_RVC_LITE_MODEL_NOT_EXISTS;
+    }
+
     if (file_exists(synth_model_path)) {
         m_synthesizer_inst = std::make_shared<CSynthesizer>();
         m_synthesizer_inst->init(synth_model_path);
         m_switch_model = true;
         return ERR_RVC_LITE_SUCCESS;
     }
     return ERR_RVC_LITE_MODEL_NOT_EXISTS;
 }

+// Clamp key to [-12, 12] semitones and store the matching frequency ratio;
+// it is picked up at the start of the next process_block() call.
+void CRvcLiteOnline::set_up_key(int key)
+{
+    if (key > 12)
+    {
+        key = 12;
+    }
+
+    if (key < -12)
+    {
+        key = -12;
+    }
+    m_f0_new_up_key = pow(2, key / 12.f);
+}
+
 // Zero all audio buffers and request a fade-in on the next block.
 void CRvcLiteOnline::reset() {
     memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);
     memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);
     memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
     m_fade_in = true;
 }

 // Convert one block: slide in_buf into the inference window, run pYIN,
 // Hubert and the synthesizer, then copy out_len samples to out_buf with a
 // crossfade against the tail kept from the previous call.
 // in_buf is modified in place when a fade-in is pending.
 // Returns ERR_RVC_LITE_BLOCK_TOO_LONG when a length is outside
 // [0, block length].
 int CRvcLiteOnline::process_block(float *in_buf, int in_len, float *out_buf, int out_len) {
     if (!m_init) {
         return ERR_RVC_LITE_NOT_INIT;
     }

     if (!m_switch_model) {
         return ERR_RVC_LITE_NOT_SWITCH_MODEL;
     }

+    // Lengths outside [0, block] would make the buffer arithmetic below read
+    // and write out of bounds.
+    if (in_len < 0 || in_len > m_input_block_frame || out_len < 0 || out_len > m_output_block_frame) {
+        return ERR_RVC_LITE_BLOCK_TOO_LONG;
+    }
+
     // When the incoming audio is discontinuous (e.g. after reset()) fade it in.
     if (m_fade_in)
     {
         for(int i = 0; i < in_len; i++)
         {
             float rate = i * 1.0 / in_len;
             in_buf[i] = in_buf[i] * rate;
         }
         m_fade_in = false;
     }

     // Drop the oldest in_len samples. Source and destination overlap, so this
     // must be memmove: memcpy on overlapping ranges is undefined behaviour.
     memmove(m_input_predict_buf, m_input_predict_buf + in_len,
             sizeof(float) * (m_input_predict_buf_frame - in_len));
     // Append in_buf at the tail.
     memcpy(m_input_predict_buf + (m_input_predict_buf_frame - in_len), in_buf, sizeof(float) * in_len);

     // Extract the f0 track.
     struct timeval start;
     struct timeval end;
     gettimeofday(&start, NULL);
+    m_f0_up_key = m_f0_new_up_key;
     get_pyin_f0();
     gettimeofday(&end, NULL);
     LOGE("CRvcLiteOnline", "get pyin sp = %f ms\n",
          (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);

     // Run Hubert.
     gettimeofday(&start, NULL);
     m_hubert_inst->process(m_input_predict_buf, m_hubert_ret);
     gettimeofday(&end, NULL);
     LOGE("CRvcLiteOnline", "m_hubert_inst sp = %f ms\n",
          (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);

     // Build the synthesizer input.
     for (int i = 0; i < gs_synth_input_frame; i++) {
         // Copy the data: 1, gs_hubert_frame, 258
         for (int j = 0; j < gs_hubert_dim; j++) {
             m_synth_input[0][i][j] = m_hubert_ret[0][i][j];
         }
+        // pYIN may return fewer frames than the model expects (or none at
+        // all); treat the missing ones as unvoiced instead of indexing past
+        // the end. 1 is the coarse value get_f0_post() gives for f0 == 0.
+        const bool has_f0 = static_cast<size_t>(i) < m_f0_data.size();
         m_synth_input[0][i][256] = has_f0 ? m_f0_coarse_data[i] : 1.f;
         m_synth_input[0][i][257] = has_f0 ? m_f0_data[i] : 0.f;
     }

     gettimeofday(&start, NULL);
     m_synthesizer_inst->process(m_synth_input, m_synth_out);
     gettimeofday(&end, NULL);
     LOGE("CRvcLiteOnline", "m_synthesizer_inst sp = %f ms\n",
          (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);

     // Put the whole result into the cache (memmove: same-buffer shift).
     memmove(m_output_cache_buf, m_output_cache_buf + gs_synth_output_frame,
             sizeof(float) * (m_output_cache_buf_frame - gs_synth_output_frame));
     memcpy(m_output_cache_buf + (m_output_cache_buf_frame - gs_synth_output_frame), m_synth_out[0][0].data(),
            sizeof(float) * gs_synth_output_frame);

     int start_pos = m_output_cache_buf_frame - m_crossfade_frame - out_len;
     memcpy(out_buf, m_output_cache_buf + start_pos, sizeof(float) * out_len);

     // Crossfade the head of the block with the saved tail. The extra
     // "i < out_len" bound keeps short blocks from overrunning out_buf.
     for (int i = 0; i < m_crossfade_frame && i < out_len; i++) {
         float rate = float(i * 1.f / m_crossfade_frame);
         out_buf[i] = rate * out_buf[i] + m_crossfade_buf[i] * (1 - rate);
     }
     memcpy(m_crossfade_buf, m_output_cache_buf + (m_output_cache_buf_frame - m_crossfade_frame),
            sizeof(float) * m_crossfade_frame);
     return 0;
 }

 int CRvcLiteOnline::get_latency_ms() {
-    return gs_crossfade_time * 1000;
+    // Besides the crossfade delay, Hubert should in theory return 208 frames
+    // but actually returns 205, hence the extra 30 ms.
+    return gs_crossfade_time * 1000 + 30;
 }

 /******************************* internal class **************************************/
 CResample::CResample() {
     m_resample_inst = nullptr;
 }

 CResample::~CResample() {

 }

 // Use the ffmpeg resampler unless only the channel count differs, in which
 // case resample() converts channels itself.
 int CResample::init(int in_samplerate, int out_samplerate, int in_channel, int out_channel) {
     // Only a channel-count mismatch takes the built-in path.
     m_in_channel = in_channel;
     m_out_channel = out_channel;
     if (in_samplerate == out_samplerate && in_channel != out_channel) {
         m_resample_inst = nullptr;
     } else {
         m_resample_inst = std::make_shared<CFfmpegResampler>();
         return m_resample_inst->init(in_samplerate, out_samplerate, in_channel, out_channel);
     }
     return ERR_RVC_LITE_SUCCESS;
 }

 int CResample::get_out_samples(int num) {
     if (m_resample_inst) {
         return m_resample_inst->get_out_samples(num);
     }
     return num;
 }

 void CResample::reset() {
     if (m_resample_inst) {
         return m_resample_inst->reset();
     }
 }

 int CResample::get_latency() {
     if (m_resample_inst) {
         return m_resample_inst->get_latency();
     }
     return 0;
 }

 // Resample via ffmpeg when available, otherwise convert channels only.
 // NOTE(review): on the channel-only paths out_num is never updated, and
 // stereo2mono receives in_num as a BUFFER length while the header documents
 // in_num as a per-channel sample count - confirm the intended contract
 // against the callers.
 int CResample::resample(float *in_buf, int in_num, float *out_buf, int &out_num) {
     if (m_resample_inst) {
         return m_resample_inst->resample(in_buf, in_num, out_buf, out_num);
     }
     if (m_in_channel == 2 && m_out_channel == 1) {
         if (out_num < in_num) {
             return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
         }
         stereo2mono(in_buf, in_num, out_buf);
         return ERR_RVC_LITE_SUCCESS;
     }

     if (m_in_channel == 1 && m_out_channel == 2) {
         if (out_num < in_num) {
             return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
         }
         mono2stereo(in_buf, in_num, out_buf);
         return ERR_RVC_LITE_SUCCESS;
     }
     return ERR_RVC_LITE_SUCCESS;
 }
/******************************* public class ***************************************/

 /******************************* internal functions ***************************************/
 // Put the scalar members into their defaults (44.1 kHz, mono, no model).
 void CRvcLiteOnlineRealTime::init_variable() {
     m_init = false;
     m_rvc_stop = true;
     m_sample_rate = 44100;
     m_channel = 1;
     m_synth_path = "";
     m_new_synth_path = "";
     m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
 }

 /******************************* public functions ***************************************/
 CRvcLiteOnlineRealTime::CRvcLiteOnlineRealTime() {
     init_variable();
 }

 CRvcLiteOnlineRealTime::~CRvcLiteOnlineRealTime() {
     uninit();
 }

 // Build the resamplers, queues and scratch buffers, pre-fill the output
 // queue with two seconds of silence and start the processing thread.
 // On any failure uninit() is called and the error code is returned.
 // NOTE(review): template arguments in this function were lost in extraction;
 // they are restored from the member names and the header's forward
 // declarations - confirm against the repository.
 int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, int sample_rate, int channel) {
     if (m_init) {
         return ERR_RVC_LITE_RT_REINIT;
     }
     if (sample_rate < 16000) {
         return ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR;
     }
     init_variable();
     m_sample_rate = sample_rate;
     m_channel = channel;
     m_synth_path = "";
     m_new_synth_path = "";
     m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
     int output_one_sec_number = m_sample_rate * m_channel; // scratch value
     int latency_len = gs_crossfade_time * m_sample_rate * m_channel;

     CThreadPool::Task task = std::bind(&CRvcLiteOnlineRealTime::rvc_process, this);
     m_rvc_inst = std::make_shared<CRvcLiteOnline>();
     int err = m_rvc_inst->init(hubert_model_path);
     if (ERR_RVC_LITE_SUCCESS != err) {
         goto exit;
     }

     // Resampling.
     m_resample_queue = std::make_shared<CRvcCircleBuffer>(sample_rate * 3 * m_channel);
     m_resample16 = std::make_shared<CResample>();
     err = m_resample16->init(m_sample_rate, gs_src_samplerate, m_channel, 1);
     if (ERR_RVC_LITE_SUCCESS != err) {
         goto exit;
     }

     m_resample2src = std::make_shared<CResample>();
     err = m_resample2src->init(gs_dst_samplerate, m_sample_rate, 1, m_channel);
     if (ERR_RVC_LITE_SUCCESS != err) {
         goto exit;
     }

     m_resample_buf_max_len = 2048; // capacity is 2048; callers must stay within it
     m_resample_in_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
     m_resample_out_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());

     // Core processing.
     m_input_tmp_buf_len = gs_src_samplerate;
     m_output_tmp_buf_len = gs_dst_samplerate;
     m_input_tmp_buf = std::shared_ptr<float>(new float[m_input_tmp_buf_len], std::default_delete<float[]>());
     m_output_tmp_buf = std::shared_ptr<float>(new float[m_output_tmp_buf_len], std::default_delete<float[]>());
     memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
     memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);

     // Ring buffers.
     m_input_queue = std::make_shared<CRvcCircleBuffer>(m_input_tmp_buf_len * 3);
     // The public side carries data at the caller's sample rate and channels.
     m_out_queue = std::make_shared<CRvcCircleBuffer>(output_one_sec_number * 3);
     m_latency_queue = std::make_shared<CRvcCircleBuffer>(latency_len);

     // Push two groups up front so the latency stays at a stable 2 s.
     for (int i = 0; i < 2; i++) {
         // Push 1 s of data.
         for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
             m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
         }
         m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
     }

     // The algorithm itself adds latency, so the same delay is inserted when
     // no effect is active to keep the overall latency constant.
     for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
         m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
     }
     m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);

     // Start the processing thread.
     m_thread_pool = std::make_shared<CThreadPool>();
     m_thread_pool->start(1);

     m_rvc_stop = false;
     m_thread_pool->run(task);
     m_init = true;
     exit:
     if (ERR_RVC_LITE_SUCCESS != err) {
         m_init = true;
         uninit();
     }
     return err;
 }

 // Record the path of the model to switch to, under the processing mutex.
 int CRvcLiteOnlineRealTime::switch_synth(const char *synth_model_path) {
     if (!m_init) {
         return ERR_RVC_LITE_RT_NOT_INIT;
     }
     {
         std::unique_lock<std::mutex> lock(m_rvc_mutex);
         m_new_synth_path = synth_model_path;
     }
     return ERR_RVC_LITE_SUCCESS;
 }

 // Queue in_buf for processing and pop out_len values into out_buf.
 // out_buf is zero-filled first; returns ERR_RVC_LITE_RT_NOT_ENOUGH_DATA
 // when fewer than out_len values were available.
 int CRvcLiteOnlineRealTime::process(float *in_buf, int in_len, float *out_buf, int out_len) {
     if (!m_init) {
         return ERR_RVC_LITE_RT_NOT_INIT;
     }

     // Write the input.
     {
         std::unique_lock<std::mutex> lock(m_rvc_mutex);
         m_resample_queue->push(in_buf, in_len);
         m_rvc_cond.notify_all();
     }

     memset(out_buf, 0, sizeof(float) * out_len);
     int tmp_out_len = out_len;
     // Read the output.
     {
         std::unique_lock<std::mutex> lock(m_rvc_mutex);
         m_out_queue->pop(out_buf, tmp_out_len);
     }

     if (tmp_out_len != out_len) {
         return ERR_RVC_LITE_RT_NOT_ENOUGH_DATA;
     }
     return ERR_RVC_LITE_SUCCESS;
 }

 // Clear every queue and resampler, then re-insert the silence pre-fill
 // exactly as init() does.
 void CRvcLiteOnlineRealTime::reset() {
     if (!m_init) {
         return;
     }

     {
         std::unique_lock<std::mutex> lock(m_rvc_mutex);
         m_resample_queue->reset();
         m_resample16->reset();
         m_resample2src->reset();
         m_input_queue->reset();
         m_out_queue->reset();
         m_rvc_inst->reset();
         m_latency_queue->reset();

         // Push two groups up front so the latency stays at a stable 2 s.
         int output_one_sec_number = m_sample_rate * m_channel; // scratch value
         memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
         for (int i = 0; i < 2; i++) {
             for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
                 m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
             }
             m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
         }

         // The algorithm itself adds latency, so the same delay is inserted
         // when no effect is active to keep the overall latency constant.
         int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
         for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
             m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
         }
         m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
     }
 }

 // NOTE(review): this definition continues past the end of the visible
 // chunk; the visible part is reproduced unchanged.
 void CRvcLiteOnlineRealTime::flush(float *&out_buf, int &len) {
     // Hand out all data that is still held internally.
     /**
      * Stop first.
      */
     stop();

     // Case without timbre conversion.
     int resample_in_len = 0;
     int resample_out_len = 0;
     if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
     {
         while (m_resample_queue->size() > 0) {
             resample_in_len = m_resample_buf_max_len;
             m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
             m_latency_queue->push(m_resample_in_buf.get(), resample_in_len);
             m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
             m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
         }

         while(m_latency_queue->size() > 0)
         {
             resample_in_len = m_resample_buf_max_len;
             m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
             m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
         }

         len = m_out_queue->size();
         out_buf = new float[len];
         m_out_queue->pop(out_buf, len);
         return;
     }

     // Case with timbre conversion.
     while (m_resample_queue->size() > 0) {
         resample_in_len
= m_resample_buf_max_len; m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len); // 输入的数据需要考虑channel resample_out_len = m_resample16->get_out_samples(resample_in_len / m_channel); m_resample16->resample(m_resample_in_buf.get(), resample_in_len / m_channel, m_resample_out_buf.get(), resample_out_len); // 输出是16k单声道,不需要考虑 m_input_queue->push(m_resample_out_buf.get(), resample_out_len); } memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len); int add_size = m_input_tmp_buf_len - m_input_queue->size() % m_input_tmp_buf_len; if (add_size != 0 && add_size < m_input_tmp_buf_len) { m_input_queue->push(m_input_tmp_buf.get(), add_size); } int num = m_input_queue->size() / m_input_tmp_buf_len; for (int i = 0; i < num; i++) { rvc_process_step(); } // 将所有数据拷贝出来 len = m_out_queue->size(); out_buf = new float[len]; m_out_queue->pop(out_buf, len); } int CRvcLiteOnlineRealTime::get_latency_ms() { return m_rvc_inst->get_latency_ms() + 2000; } /*******************************对内函数***************************************/ void CRvcLiteOnlineRealTime::uninit() { if (!m_init) { return; } stop(); } void CRvcLiteOnlineRealTime::stop() { // 释放thread_pool的数据,先通知一下rvc_process,防止是在等待中 m_rvc_stop = true; if (m_thread_pool) { m_rvc_cond.notify_all(); m_thread_pool->stop(); } } void CRvcLiteOnlineRealTime::rvc_process_step() { struct timeval start; struct timeval end; int sample_out_len = 0; // 开始处理 if (m_input_queue->size() < m_input_tmp_buf_len) { return; } gettimeofday(&start, NULL); m_input_queue->pop(m_input_tmp_buf.get(), m_input_tmp_buf_len); m_rvc_inst->process_block(m_input_tmp_buf.get(), m_input_tmp_buf_len, m_output_tmp_buf.get(), m_output_tmp_buf_len); gettimeofday(&end, NULL); LOGD("RvcLite", "rvc_process process sp %f ms", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0); // 重采样 // 考虑到此处采样率变大,但是最多也不到两倍,但是通道数有可能扩展到两倍,所以按照1/4进行设置 gettimeofday(&start, NULL); bool last = false; int step = m_resample_buf_max_len / 4; for (int i = 0; i 
< m_output_tmp_buf_len; i += step) { if (i + step >= m_output_tmp_buf_len) { step = m_output_tmp_buf_len - i; last = true; } // 此时的输入是单声道,采样点数量和总长度一致 sample_out_len = m_resample2src->get_out_samples(step); m_resample2src->resample(m_output_tmp_buf.get() + i, step, m_resample_out_buf.get(), sample_out_len); // 从有到无 if(last && m_syn_state == RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT) { // 因为不加音效也需要延迟对齐,所以此处只要做fade_out就行了 for(int ii =0; ii < sample_out_len * m_channel; ii+=m_channel) { float rate = ii * 1.0 / step; for(int jj = 0; jj < m_channel; jj++) { m_resample_out_buf.get()[ii+jj] = m_resample_out_buf.get()[ii+jj] * (1 - rate); } } m_syn_state = RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT; } { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(m_resample_out_buf.get(), sample_out_len * m_channel); } } gettimeofday(&end, NULL); LOGD("RvcLite", "rvc_process re_resample sp %f ms", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0); printf("finish ...\n"); } void CRvcLiteOnlineRealTime::rvc_process() { int sample_in_len; int sample_out_len = 0; while (!m_rvc_stop) { { // 重采样 std::unique_lock lock(m_rvc_mutex); if (m_resample_queue->size() < m_resample_buf_max_len) { // 睡眠前检查下情况 if (m_rvc_stop) { return; } m_rvc_cond.wait(lock); continue; } sample_in_len = m_resample_buf_max_len; m_resample_queue->pop(m_resample_in_buf.get(), sample_in_len); } /** * 此处有三种情况: * 因为无论哪种变换,有延迟的存在,导致输入的数据都是需要塞0进去,所以对当前的数据做fade_out即可 * 1. 无到有:对无到有的部分做个fade_out,对下一帧要塞入音效器的部分做fade_in * 2. 有到无:对无到有的部分做个fade_out,对下一帧要塞入音效器的部分做fade_in * 3. 
有到有[这个不用考虑,内部自己做了处理] */ if (m_synth_path != m_new_synth_path) { // 从无到有,此时对本帧做fade_out,对下一帧输入做fade_in if(m_synth_path.empty() && !m_new_synth_path.empty()) { m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT; } // 从有到无 if (!m_synth_path.empty() && m_new_synth_path.empty()) { m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT; } { std::unique_lock lock(m_rvc_mutex); m_synth_path = m_new_synth_path; } m_rvc_inst->switch_synth_model(m_new_synth_path.c_str()); } // 刚切过来第一次做效果 if(m_syn_state == RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT) { // 刚从有到无,需要清空数据,以及对输入的队列添加fade_in m_latency_queue->reset(); // 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟 memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len); int latency_len = gs_crossfade_time * m_sample_rate * m_channel; for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) { m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len); } m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len); // 对输入做fade_in for(int i = 0; i < sample_in_len; i+=m_channel) { float rate = i * 1.0 / sample_in_len; for(int j = 0; j < m_channel; j++) { m_resample_in_buf.get()[i+j] *= rate; } } m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT; } // 不做效果 if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT) { m_latency_queue->push(m_resample_in_buf.get(), sample_in_len); m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len); { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(m_resample_in_buf.get(), sample_in_len); } continue; } // 从无到有的转换 if (m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT) { // 做fade_out for(int i = 0; i < sample_in_len; i+=m_channel) { float rate = i * 1.0 / sample_in_len; for(int j = 0; j < m_channel; j++) { m_resample_in_buf.get()[i+j] *= 1 - rate; } } m_latency_queue->push(m_resample_in_buf.get(), sample_in_len); m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len); { std::unique_lock lock(m_rvc_mutex); m_out_queue->push(m_resample_in_buf.get(), sample_in_len); } // 
此时对于rvc来说输入的数据不连贯了,所以清空内部数据重新搞 m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT; m_rvc_inst->reset(); continue; } // 重采样到16k,此处采样率变低,所以不会出现sample_out_len > sample_in_len的情况 sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel); m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(), sample_out_len); m_input_queue->push(m_resample_out_buf.get(), sample_out_len); rvc_process_step(); } } \ No newline at end of file diff --git a/mnn_demo/src/CRvcLiteOnlineV2.cpp b/mnn_demo/src/CRvcLiteOnlineV2.cpp new file mode 100644 index 0000000..0269a7d --- /dev/null +++ b/mnn_demo/src/CRvcLiteOnlineV2.cpp @@ -0,0 +1,215 @@ +// +// Created by Administrator on 2024/1/22. +// + +#include "CRvcLiteOnlineV2.h" +#include "CRvcCircleBuffer.h" +#include + +inline bool file_exists1 (const std::string& name) { + return ( access( name.c_str(), F_OK ) != -1 ); +} + +CRvcLiteOnlineV2::CRvcLiteOnlineV2() +{ + +} + +CRvcLiteOnlineV2::~CRvcLiteOnlineV2() +{ + +} +/*****************************************对内函数***************************************************************/ +void CRvcLiteOnlineV2::set_cur_state(bool reset) +{ + /** + * 一共三种状态 + * 从无到有: 让不做效果的fade_out,做效果的fade_in + * 从有到无: 让做效果的fade_out, 不做效果的fade_in即可 + * 从有到有,这种情况不考虑,内部自己会做fade + */ + if (m_syn_model != m_new_syn_model) + { + // 从无到有 + if (m_syn_model.empty() && !m_new_syn_model.empty()) + { + m_sync_state = CRVC_V2_STATE_DEFAULT2EFFECT; + + // 如果此时已经发生了reset,则不需要做切换,直接做就行 + if (reset) + { + m_sync_state = CRVC_V2_STATE_EFFECT; + } + m_syn_model = m_new_syn_model; + m_rvc_inst->switch_model(m_syn_model.c_str()); + } + + // 从有到无 + if (!m_syn_model.empty() && m_new_syn_model.empty()) + { + m_sync_state = CRVC_V2_STATE_EFFECT2DEFAULT; + // 如果此时已经发生了reset,则不需要做切换,直接做就行 + if (reset) + { + m_sync_state = CRVC_V2_STATE_DEFAULT; + } + m_syn_model = m_new_syn_model; + } + } +} + 
+/*****************************************对外函数***************************************************************/ +int CRvcLiteOnlineV2::init(const char *hubert_model, int sample_rate, int channel) +{ + m_rvc_inst = std::make_shared(); + m_block_len = sample_rate * channel - 100 * channel; + m_tmp_buf_len = m_block_len * 2; + m_reset = true; + m_syn_model = ""; + m_new_syn_model = ""; + m_sync_state = CRVC_V2_STATE_DEFAULT; + m_fade_len = int(sample_rate * 0.05) * channel; // 50ms的时长用来做fade + m_channel = channel; + + m_tmp_in_buf = std::shared_ptr(new float[m_tmp_buf_len], std::default_delete()); + m_tmp_out_buf = std::shared_ptr(new float[m_tmp_buf_len], std::default_delete()); + m_in_queue = std::make_shared(m_tmp_buf_len * 2); + m_out_queue = std::make_shared(m_tmp_buf_len * 2); + m_input_latency_output_frame = 0; + return m_rvc_inst->init(hubert_model, sample_rate, channel); +} + +int CRvcLiteOnlineV2::switch_model(const char *synth_model) +{ + if (synth_model != "" && !file_exists1(synth_model)) + { + return ERR_RVC_LITE_MODEL_NOT_EXISTS; + } + + m_new_syn_model = synth_model; + return ERR_RVC_LITE_SUCCESS; +} + +void CRvcLiteOnlineV2::set_up_key(int key) +{ + // 内部是线程安全的,所以直接设置即可 + m_rvc_inst->set_up_key(key); +} + +void CRvcLiteOnlineV2::reset() +{ + m_reset = true; +} + + +int CRvcLiteOnlineV2::push(float *buf, int len, bool last) +{ + bool reset = m_reset; + if (m_reset) + { + m_reset = false; + m_input_latency_output_frame = 0; + m_in_queue->reset(); + m_out_queue->reset(); + m_rvc_inst->reset(); + } + + set_cur_state(reset); + + if (CRVC_V2_STATE_DEFAULT == m_sync_state) + { + std::unique_lock lock(m_rvc_mutex); + m_out_queue->push(buf, len); + return ERR_RVC_LITE_SUCCESS; + } + + // 此时无论怎样,都要让模型跑一下,得到结果再说 + m_in_queue->push(buf, len); + while(m_in_queue->size() >= m_block_len || last) { + if (m_in_queue->size() <= 0) + { + return ERR_RVC_LITE_SUCCESS; + } + + int cur_in_len = m_block_len; + int cur_out_len = m_block_len; + 
m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len); + int err = m_rvc_inst->process(m_tmp_in_buf.get(), cur_in_len, m_tmp_out_buf.get(), cur_out_len); + if (err != ERR_RVC_LITE_SUCCESS) { + return err; + } + + // 此时对于effect做fade_out,default做fade_in + if (m_sync_state == CRVC_V2_STATE_EFFECT2DEFAULT) + { + // 此时由于m_rvc_inst本身存在延迟输出的情况[虽然头部的静音帧已经被砍掉了],但是其输入的数据和输出的数据并不是完美对应的,存在延迟差 + // 所以此时输入的头部和输出的头部之前存在延迟差,但是不加音效是没有这个延迟差的 + // 所以需要将输入的头部对应到其应该对应的输出真实数据的头部 + // 比如: 输入: 1,2,3,4,5 输出: l1,l2,1,2,3 ,其中l1和l2是延迟采样点,也就是1,2,对应的是输出+延迟采样点才对 + for(int i = 0; i < m_fade_len; i+=m_channel) + { + float rate = i * 1.0 / m_fade_len; + for(int j = 0; j < m_channel; j+=1) + { + m_tmp_in_buf.get()[i+j] = m_tmp_in_buf.get()[i+j] * rate + m_tmp_out_buf.get()[i+j+m_input_latency_output_frame] * (1 - rate); + } + } + { + std::unique_lock lock(m_rvc_mutex); + // 将之前要输入的那块塞进去 + m_out_queue->push(m_tmp_out_buf.get(), m_input_latency_output_frame); + m_out_queue->push(m_tmp_in_buf.get(), cur_in_len); + } + + m_sync_state = CRVC_V2_STATE_DEFAULT; + m_input_latency_output_frame = 0; + + while(m_in_queue->size() > 0) + { + cur_in_len = m_block_len; + m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len); + { + std::unique_lock lock(m_rvc_mutex); + m_out_queue->push(m_tmp_in_buf.get(), cur_in_len); + } + } + return ERR_RVC_LITE_SUCCESS; + } + + // 此时对effect做fade_in,default做fade_out + if (m_sync_state == CRVC_V2_STATE_DEFAULT2EFFECT) + { + for(int i = 0; i < m_fade_len; i+=m_channel) + { + float rate = i * 1.0 / m_fade_len; + for(int j = 0; j < m_channel; j+=1) + { + m_tmp_out_buf.get()[i+j] = m_tmp_out_buf.get()[i+j] * rate + m_tmp_in_buf.get()[i+j] * (1 - rate); + } + } + // 设置状态 + m_sync_state = CRVC_V2_STATE_EFFECT; + } + + // effect会存在输入和输出长度不一致的情况 + m_input_latency_output_frame += cur_in_len - cur_out_len; + + // 加锁塞入数据 + { + std::unique_lock lock(m_rvc_mutex); + m_out_queue->push(m_tmp_out_buf.get(), cur_out_len); + } + } + return ERR_RVC_LITE_SUCCESS; +} + +int CRvcLiteOnlineV2::size() +{ + return 
m_out_queue->size(); +} + +void CRvcLiteOnlineV2::pop(float *buf, int &len) +{ + std::unique_lock lock(m_rvc_mutex); + m_out_queue->pop(buf, len); +} diff --git a/mnn_demo/src/CRvcLiteSynthesizer.cpp b/mnn_demo/src/CRvcLiteSynthesizer.cpp index 9bce8d7..6ff952b 100644 --- a/mnn_demo/src/CRvcLiteSynthesizer.cpp +++ b/mnn_demo/src/CRvcLiteSynthesizer.cpp @@ -1,106 +1,128 @@ // // Created by Administrator on 2024/1/21. // #include "CRvcLiteSynthesizer.h" #include #include CRvcLiteSynthesizer::CRvcLiteSynthesizer(){} CRvcLiteSynthesizer::~CRvcLiteSynthesizer() {} -int CRvcLiteSynthesizer::init(const char *hubert_model, const char *synth_model, int sample_rate, int channel) +int CRvcLiteSynthesizer::init(const char *hubert_model, int sample_rate, int channel) { m_rvc_inst = std::make_shared(); int err = m_rvc_inst->init(hubert_model); if (err != ERR_RVC_LITE_SUCCESS) { return err; } - err = m_rvc_inst->switch_synth_model(synth_model); - if (err != ERR_RVC_LITE_SUCCESS) - { - return err; - } + m_resample2_16 = std::make_shared(); m_resample2_16->init(sample_rate, gs_src_samplerate, channel, 1); m_resample2src = std::make_shared(); m_resample2src->init(gs_dst_samplerate, sample_rate, 1, channel); m_channel = channel; m_sample_rate = sample_rate; m_buf_tmp_16k_len = 0; m_buf_tmp_16k_cap = 0; m_buf_tmp_32k_len = 0; m_buf_tmp_32k_cap = 0; m_buf_tmp_src_len = 0; m_buf_tmp_src_cap = 0; + m_first = true; return ERR_RVC_LITE_SUCCESS; } +int CRvcLiteSynthesizer::switch_model(const char *synth_model) +{ + return m_rvc_inst->switch_synth_model(synth_model); +} + +void CRvcLiteSynthesizer::set_up_key(int key) +{ + m_rvc_inst->set_up_key(key); +} + +void CRvcLiteSynthesizer::reset() +{ + m_rvc_inst->reset(); + m_first = true; +} + int CRvcLiteSynthesizer::process(float *in_buf, int in_len, float *out_buf, int &out_len) { // 1 重采样 2 推理 3 再次重采样 int resample_out_len = m_resample2_16->get_out_samples(in_len / m_channel); // 控制逻辑,不能超过该长度 if (resample_out_len > gs_src_samplerate) { return 
ERR_RVC_LITE_BLOCK_TOO_LONG; } if (m_buf_tmp_16k_cap < resample_out_len) { m_buf_tmp_16k_cap = resample_out_len; m_buf_tmp_16k = std::shared_ptr(new float[m_buf_tmp_16k_cap], std::default_delete()); } m_buf_tmp_16k_len = resample_out_len; int err = m_resample2_16->resample(in_buf, in_len / m_channel, m_buf_tmp_16k.get(), m_buf_tmp_16k_len); if (err != ERR_RVC_LITE_SUCCESS) { return err; } if (m_buf_tmp_32k_cap < m_buf_tmp_16k_len * 2) { m_buf_tmp_32k_cap = m_buf_tmp_16k_len * 2; m_buf_tmp_32k = std::shared_ptr(new float[m_buf_tmp_32k_cap], std::default_delete()); } m_buf_tmp_32k_len = m_buf_tmp_16k_len * 2; // 推理 err = m_rvc_inst->process_block(m_buf_tmp_16k.get(), m_buf_tmp_16k_len, m_buf_tmp_32k.get(), m_buf_tmp_32k_len); if (err != ERR_RVC_LITE_SUCCESS) { return err; } // 重采样回来 int out_frame = m_resample2src->get_out_samples(m_buf_tmp_32k_len); if (m_buf_tmp_src_cap < out_frame * m_channel) { m_buf_tmp_src_cap = out_frame * m_channel; m_buf_tmp_src = std::shared_ptr(new float[m_buf_tmp_src_cap], std::default_delete()); } m_buf_tmp_src_len = out_frame; err = m_resample2src->resample(m_buf_tmp_32k.get(), m_buf_tmp_32k_len, m_buf_tmp_src.get(), m_buf_tmp_src_len); if (err != ERR_RVC_LITE_SUCCESS) { return err; } // 取较小的值 if (out_len > m_buf_tmp_src_len * m_channel) { out_len = m_buf_tmp_src_len * m_channel; } - memcpy(out_buf, m_buf_tmp_src.get(), sizeof(float) * out_len); + // 第一次过来,将头部的延迟块切掉 + int latency_frame = 0; + if (m_first) + { + m_first = false; + latency_frame = int(m_rvc_inst->get_latency_ms() * 1.0 / 1000 * m_sample_rate) * m_channel; + out_len -= latency_frame; + } + memcpy(out_buf, m_buf_tmp_src.get()+latency_frame, sizeof(float) * out_len); return ERR_RVC_LITE_SUCCESS; } + float CRvcLiteSynthesizer::get_rtf() { struct timeval start; struct timeval end; gettimeofday(&start, NULL); int in_len = m_sample_rate * m_channel - 100 *m_channel; int out_len = in_len; float* in_buf = new float[in_len]; process(in_buf, in_len, in_buf, in_len); delete [] in_buf; 
gettimeofday(&end, NULL); double sp = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0; return sp / 1000; } \ No newline at end of file