No OneTemporary
Actions

Size

63 KB

Subscribers

None

View Options

	diff --git a/.arcconfig b/.arcconfig
	new file mode 100644
	index 0000000..99e9d55
	--- /dev/null
	+++ b/.arcconfig
	@@ -0,0 +1,4 @@
	+{
	+ "phabricator.uri" : "http://phabricator.ushow.media/",
	+ "editor": "vim"
	+}
	diff --git a/mnn_demo/inc/CRvcLiteSynthesizer.h b/mnn_demo/inc/CRvcLiteSynthesizer.h
	index cedcb2e..c25d0df 100644
	--- a/mnn_demo/inc/CRvcLiteSynthesizer.h
	+++ b/mnn_demo/inc/CRvcLiteSynthesizer.h
	@@ -1,82 +1,82 @@
	//
	// Created by Administrator on 2024/1/21.
	//

	#ifndef MNN_DEMO_CRVCLITESYNTHESIZER_H
	#define MNN_DEMO_CRVCLITESYNTHESIZER_H
	#include "CRvcLiteOnline.h"

	/**
	* Public interface of the RVC-lite synthesizer: feed audio at the caller's
	* sample rate / channel count through process() and read converted audio back.
	*/
	class CRvcLiteSynthesizer
	{
	public:
	CRvcLiteSynthesizer();
	~CRvcLiteSynthesizer();

	public:
	/**
	* Initialize the synthesizer.
	* @param hubert_model path of the semantic (HuBERT) model
	* @param sample_rate sample rate of the audio that will be passed to process()
	* @param channel number of channels of that audio
	* @return 0 on success
	*/
	int init(const char* hubert_model, int sample_rate, int channel);

	/**
	* Select the voice (timbre) model.
	* @param synth_model path of the timbre model
	* @return
	*/
	int switch_model(const char* synth_model);

	/**
	* Set the pitch shift, range [-12, 12].
	* Only takes effect while a voice model is selected.
	* The value survives a model switch, and a value set while no voice model
	* is selected is applied once a model becomes available.
	* @param key
	*/
	void set_up_key(int key);

	/**
	* Reset: clear the internal data.
	*/
	void reset();

	/**
	- * Processing logic
	+ * Processing logic: keep each input reasonably short; about 900 ms per call is recommended
	* @param in_buf input buffer
	* @param in_len length of the input buffer, frame*channel; at most about 1 s of audio is recommended, as large as possible within that limit
	* @param out_buf output buffer
	* @param out_len length of the output buffer, frame*channel
	* Note: the number of samples produced is not necessarily equal to in_len; it is less than or equal to out_len, but the
	* output is continuous, so out_len may be chosen somewhat larger than in_len to make sure everything can be drained
	* @return
	*/
	int process(float* in_buf, int in_len, float* out_buf, int &out_len);

	/**
	* Get the real-time factor: actual time spent processing 1 s of data / 1 s.
	* @return
	*/
	float get_rtf();

	private:
	// NOTE(review): the roles below are inferred from the member names only; confirm against the .cpp.
	std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
	std::shared_ptr<CResample> m_resample2_16;
	std::shared_ptr<CResample> m_resample2src;
	int m_channel;
	int m_sample_rate;
	// Scratch buffers, each with a current length (_len) and a capacity (_cap).
	std::shared_ptr<float> m_buf_tmp_16k;
	int m_buf_tmp_16k_len;
	int m_buf_tmp_16k_cap;
	std::shared_ptr<float> m_buf_tmp_32k;
	int m_buf_tmp_32k_len;
	int m_buf_tmp_32k_cap;
	std::shared_ptr<float> m_buf_tmp_src;
	int m_buf_tmp_src_len;
	int m_buf_tmp_src_cap;
	bool m_first;
	};


	#endif //MNN_DEMO_CRVCLITESYNTHESIZER_H
	diff --git a/mnn_demo/main.cpp b/mnn_demo/main.cpp
	index 0d6b685..d742793 100644
	--- a/mnn_demo/main.cpp
	+++ b/mnn_demo/main.cpp
	@@ -1,285 +1,286 @@
	#include <sys/time.h>
	#include <thread>
	#include <chrono>
	#include "src/Hubert.h"
	#include "src/CSynthesizer.h"
	#include "CRvcLiteSynthesizer.h"
	#include "CRvcLiteOnlineV2.h"
	/**
	 * Smoke test for the HuBERT wrapper: loads the model and runs a single
	 * inference on a constant 33280-sample input into a [1][205][256] container.
	 * @return always 0
	 */
	int test_hubert() {
	    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v1_fp16.mnn";
	    Hubert hubert;
	    // NOTE(review): the init status is captured but never inspected.
	    int init_status = hubert.init(hubert_model_path);

	    std::vector<float> input(33280, 0.1);
	    // Pre-size the result container to [1][205][256], zero-filled.
	    std::vector<std::vector<std::vector<float>>> ret(
	        1, std::vector<std::vector<float>>(205, std::vector<float>(256)));

	    float elapsed = hubert.process(input.data(), ret);
	    return 0;
	}

	/**
	 * Timing test for the synthesizer model: builds a constant [1][205][258]
	 * input, runs the model several times and prints the mean value returned by
	 * process().
	 *
	 * Fix: the mean is now divided by the actual number of runs (the original ran
	 * 10 iterations but divided by 100).
	 * @return always 0
	 */
	int test_contentvec() {
	    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
	    CSynthesizer contentVec;
	    // NOTE(review): the init status is captured but never inspected.
	    int err_code = contentVec.init(contentvec_model_path);

	    // Every frame holds 256 values of 0.1 followed by 0.2 (index 256) and 1.0 (index 257).
	    std::vector<float> frame(258, 0.1);
	    frame[256] = 0.2;
	    frame[257] = 1.0;
	    std::vector<std::vector<std::vector<float>>> input(1);
	    input[0].assign(205, frame);

	    // Output container shaped [1][1][35840].
	    std::vector<std::vector<std::vector<float>>> ret(
	        1, std::vector<std::vector<float>>(1, std::vector<float>(35840)));

	    const int run_count = 10;
	    float tot = 0.f;
	    for (int i = 0; i < run_count; i++) {
	        float time = contentVec.process(input, ret);
	        tot += time;
	    }
	    printf("time: %f \n", tot / run_count);
	    return 0;
	}

	#include "CRvcLiteOnline.h"
	#include "av_waves/waves/inc/STWaveFile.h"

	/**
	 * Offline test of CRvcLiteOnline::process_block: reads a wav file, converts
	 * it one second at a time and writes the result as a 32 kHz mono float wav.
	 *
	 * Fixes: the output buffer declaration is restored to a pointer of 2*len
	 * floats (the `*` characters had been lost), the buffer is zero-initialised so
	 * that a failing process_block does not leave garbage in the file, and both
	 * buffers are released before returning.
	 *
	 * NOTE(review): no synthesizer model is ever selected here, and
	 * process_block returns ERR_RVC_LITE_NOT_SWITCH_MODEL until
	 * switch_synth_model() succeeds; contentvec_model_path is unused.
	 */
	void test() {
	    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
	    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
	    const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_16.wav";
	    // const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
	    const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v1.wav";

	    CRvcLiteOnline rvc_inst;
	    rvc_inst.init(hubert_model_path);

	    // Read the audio file; 16 kHz mono is required.
	    STCWaveFile wav_inst(in_wav, false);
	    int sample_rate = wav_inst.GetSampleRate();
	    int channel = wav_inst.GetChannels();
	    int len = wav_inst.GetTotalFrames() * channel;
	    float *data = new float[len];
	    // The output has twice as many samples as the input.
	    float *outdata = new float[len * 2]();
	    wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
	    int step = sample_rate;
	    printf("start ..\n");
	    for (int i = 0; i < len; i += step) {
	        if (i + step > len) {
	            step = len - i;
	        }
	        struct timeval start;
	        struct timeval end;
	        gettimeofday(&start, NULL);
	        rvc_inst.process_block(data + i, step, outdata + 2 * i, 2 * step);
	        gettimeofday(&end, NULL);
	        printf("sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
	    }
	    STCWaveFile wav_out_inst(out_wav, true);
	    wav_out_inst.SetSampleRate(32000);
	    wav_out_inst.SetChannels(1);
	    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
	    wav_out_inst.SetupDone();
	    wav_out_inst.WriteFrame(outdata, len * 2);
	    printf("finish2 ....\n");

	    delete[] data;
	    delete[] outdata;
	}


	/**
	 * Offline test of CRvcLiteSynthesizer: reads a wav file, pushes it through
	 * process() in chunks of slightly under one second and writes the result at
	 * the source sample rate / channel count.
	 *
	 * Fixes: the buffers are released on the error return as well, and the output
	 * buffer is zero-initialised because process() may produce fewer samples than
	 * requested while the full frame count is still written to the file.
	 */
	void test_rvc_lite_synth()
	{
	    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
	    const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
	    const char *out_wav = "/mnt/d/dataset/tmp/i_out3.wav";
	    const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";

	    STCWaveFile wav_inst(in_wav, false);
	    int sample_rate = wav_inst.GetSampleRate();
	    int channel = wav_inst.GetChannels();
	    int len = wav_inst.GetTotalFrames() * channel;
	    float *data = new float[len];
	    float *outdata = new float[len]();
	    wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
	    CRvcLiteSynthesizer m_rvc_inst;
	    int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel);
	    printf("init err=%d!\n", err);
	    printf("rtf=%f\n", m_rvc_inst.get_rtf());
	    // Chunk size: one second minus 100 frames.
	    int step = sample_rate * channel - 100 * channel;
	    int out_len = 0;
	    for (int i = 0; i < len; i += step)
	    {
	        if (i + step > len) {
	            step = len - i;
	        }
	        // In: capacity offered to process(); out: number of samples actually written.
	        int out_step = step;
	        err = m_rvc_inst.process(data + i, step, outdata + out_len, out_step);
	        if (err != ERR_RVC_LITE_SUCCESS)
	        {
	            printf("process err=%d!\n", err);
	            delete[] data;
	            delete[] outdata;
	            return;
	        }
	        out_len += out_step;
	    }
	    STCWaveFile wav_out_inst(out_wav, true);
	    wav_out_inst.SetSampleRate(sample_rate);
	    wav_out_inst.SetChannels(channel);
	    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
	    wav_out_inst.SetupDone();
	    wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
	    delete[] data;
	    delete[] outdata;
	}

	/**
	 * Offline test of CRvcLiteOnlineV2: pushes a wav file chunk by chunk, selects
	 * the voice model after one third of the input, resets after one half, and
	 * pops whatever output is available after every push.
	 *
	 * Fixes:
	 *  - pop() is never offered more room than is left in outdata (the original
	 *    offered 2*step per iteration into a buffer of only len floats);
	 *  - `last` is also raised when the final chunk is exactly one step long;
	 *  - the buffers are released on the error return;
	 *  - outdata is zero-initialised because the full frame count is written
	 *    even if fewer samples were popped.
	 * The diff marker lines of the original listing are kept as they were.
	 */
	void test_rvc_lite_v2()
	{
	    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
	    const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
	    const char *out_wav = "/mnt/d/dataset/tmp/i_out_01_r.wav";
	- const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
	+ const char *in_wav = "/mnt/d/dataset/tmp/t1_48.wav";

	    STCWaveFile wav_inst(in_wav, false);
	    int sample_rate = wav_inst.GetSampleRate();
	    int channel = wav_inst.GetChannels();
	    int len = wav_inst.GetTotalFrames() * channel;
	    float *data = new float[len];
	    float *outdata = new float[len]();
	    wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
	    CRvcLiteOnlineV2 m_rvc_inst;
	    int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel);
	    // m_rvc_inst.switch_model(syz_model);
	    // m_rvc_inst.set_up_key(0);
	    printf("init err=%d!\n", err);
	    // Chunk size: one second minus 100 frames.
	    int step = sample_rate * channel - 100 * channel;
	    int out_len = 0;
	    bool last = false;
	    int flag = 0;
	    for (int i = 0; i < len; i += step)
	    {
	        if (i + step >= len) {
	            step = len - i;
	            last = true;
	        }
	        err = m_rvc_inst.push(data + i, step, last);
	        if (err != ERR_RVC_LITE_SUCCESS)
	        {
	            printf("process err=%d!\n", err);
	            delete[] data;
	            delete[] outdata;
	            return;
	        }

	        // After one third of the input: select the voice model.
	        if (i >= len / 3 && flag == 0)
	        {
	            flag = 1;
	            m_rvc_inst.switch_model(syz_model);
	        }

	+
	        // After one half of the input: reset the instance.
	        if (i >= len / 2 && flag == 1)
	        {
	            flag = 2;
	            m_rvc_inst.reset();
	        }

	        // Ask for up to two steps of output, bounded by the space left in outdata.
	        int out_step = 2 * step;
	        if (out_step > len - out_len) {
	            out_step = len - out_len;
	        }
	        m_rvc_inst.pop(outdata + out_len, out_step);
	        out_len += out_step;
	    }
	    STCWaveFile wav_out_inst(out_wav, true);
	    wav_out_inst.SetSampleRate(sample_rate);
	    wav_out_inst.SetChannels(channel);
	    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
	    wav_out_inst.SetupDone();
	    wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
	    delete[] data;
	    delete[] outdata;
	}

	/**
	 * Real-time style test of CRvcLiteOnlineRealTime: feeds a wav file in
	 * 1024-sample blocks with a 15 ms pause per block, resets the instance after
	 * one third of the input, writes the processed audio and finally appends
	 * whatever flush() returns.
	 *
	 * Fixes: data, outdata and the buffer allocated by flush() (new float[])
	 * are released, and the flush out-parameters start from a defined state.
	 */
	void test_rvc_lite_online() {
	    // const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
	    // const char *hubert_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/layers6_checkpoint_14_1660000_1_hubert.mnn";
	    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
	    // const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
	    // const char *syz_model = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xusong_v1_6hubert_hifix_syz_base_vctk_kd_32k_hubert6_jianli_e225_s62775_205.mnn";
	    const char *xs_model = "/mnt/d/dataset/svc/models/layers_3/layer3_xusong.mnn";
	    const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
	    // const char *contentvec_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xiafan_fp16.mnn";

	    // const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01.wav";
	    const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
	    // const char* in_wav = "/mnt/d/dataset/svc/dataset/短数据样本/男声/qiankun.wav";
	    // const char* in_wav = "/mnt/d/dataset/tmp/i.wav";
	    // const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
	    // const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v4.wav";
	    // const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/qiankun_412_v4.wav";
	    const char *out_wav = "/mnt/d/dataset/tmp/i_out2.wav";

	    // Read the audio file (the original note says 16 kHz mono is required).
	    STCWaveFile wav_inst(in_wav, false);
	    int sample_rate = wav_inst.GetSampleRate();
	    int channel = wav_inst.GetChannels();
	    int len = wav_inst.GetTotalFrames() * channel;
	    float *data = new float[len];
	    float *outdata = new float[len];

	    CRvcLiteOnlineRealTime rvc_inst;
	    rvc_inst.init(hubert_model_path, sample_rate, channel);

	    wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
	    int step = 1024;
	    printf("start ..\n");
	    bool flag = true;
	    rvc_inst.switch_synth(syz_model);
	    for (int i = 0; i < len; i += step) {
	        if (i + step > len) {
	            step = len - i;
	        }
	        struct timeval start;
	        struct timeval end;
	        gettimeofday(&start, NULL);
	        int ret = rvc_inst.process(data + i, step, outdata + i, step);
	        // Pause between blocks to imitate a real-time audio callback.
	        std::this_thread::sleep_for(std::chrono::milliseconds (15));
	        gettimeofday(&end, NULL);
	        printf("ret = %d, sp = %f ms step=%d\n", ret,
	               (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0, step);

	        // Reset once, after one third of the input has been consumed.
	        if (flag && i >= len / 3) {
	            flag = false;
	            rvc_inst.reset();
	            // rvc_inst.switch_synth(xs_model);
	        }
	    }
	    STCWaveFile wav_out_inst(out_wav, true);
	    wav_out_inst.SetSampleRate(sample_rate);
	    wav_out_inst.SetChannels(channel);
	    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
	    wav_out_inst.SetupDone();
	    wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());

	    // flush() hands back a buffer it allocated with new[]; the caller owns it.
	    float* flush_data = nullptr;
	    int flush_len = 0;
	    rvc_inst.flush(flush_data, flush_len);
	    wav_out_inst.WriteFrame(flush_data, flush_len/channel);
	    printf("finish2 ....\n");

	    delete[] flush_data;
	    delete[] data;
	    delete[] outdata;
	}

	/**
	 * Entry point of the demo: runs exactly one of the manual tests.
	 * The other tests stay listed here, disabled, so they can be switched on by hand.
	 */
	int main() {
	    // Disabled alternatives:
	    //   int ret_hubert = test_hubert();
	    //   int ret_contentvec = test_contentvec();
	    //   test();
	    //   test_rvc_lite_online();
	    //   test_rvc_lite_synth();
	    test_rvc_lite_v2();
	    return 0;
	}
	diff --git a/mnn_demo/src/CRvcLiteOnline.cpp b/mnn_demo/src/CRvcLiteOnline.cpp
	index f9067f7..60b9fa6 100644
	--- a/mnn_demo/src/CRvcLiteOnline.cpp
	+++ b/mnn_demo/src/CRvcLiteOnline.cpp
	@@ -1,831 +1,831 @@
	//
	// Created by Administrator on 2023/11/29.
	//

	#include <cmath>
	#include <cstring>
	#include <sys/time.h>
	#include "CRvcLiteOnline.h"
	#include "Hubert.h"
	#include "CSynthesizer.h"
	#include "espyin-v1.0/ESPYIN.h"
	#include "ThreadPool.h"
	#include "CRvcCircleBuffer.h"
	#include "FfmpegResampler.h"
	#include <unistd.h>

	/**
	 * Reports whether a path exists, using access(2) with F_OK.
	 * @param name path to probe
	 * @return true when access() does not return -1
	 */
	inline bool file_exists (const std::string& name) {
	    const int access_result = access(name.c_str(), F_OK);
	    return access_result != -1;
	}

	/**
	 * Down-mixes interleaved stereo to mono by averaging each left/right pair.
	 *
	 * Fix: the pointer declarators of `input` and `output` are restored; the
	 * listing had lost them, so the function indexed plain floats and did not compile.
	 *
	 * @param input  interleaved stereo samples
	 * @param size   length of `input` in floats (not frames); a trailing unpaired sample is ignored
	 * @param output receives size / 2 mono samples
	 */
	void stereo2mono(float *input, int size, float *output) {
	    for (int i = 0; i < size - 1; i += 2) {
	        output[i / 2] = (input[i] + input[i + 1]) / 2;
	    }
	}

	/**
	 * Expands mono to interleaved stereo by writing every sample to both channels.
	 *
	 * Fix: the pointer declarators of `input` and `output` are restored; the
	 * listing had lost them, so the function indexed plain floats and did not compile.
	 *
	 * @param input  mono samples
	 * @param size   number of mono samples in `input`
	 * @param output receives 2 * size interleaved samples
	 */
	void mono2stereo(float *input, int size, float *output) {
	    for (int i = 0; i < size; i++) {
	        output[2 * i] = input[i];
	        output[2 * i + 1] = input[i];
	    }
	}


	/**
	 * Sizes and allocates every fixed working buffer of the converter.
	 * Lengths are derived from the gs_* time and sample-rate constants and are
	 * rounded up to a whole number of 10 ms hops.
	 */
	CRvcLiteOnline::CRvcLiteOnline() {
	    init_variable();
	    m_init = false;
	    m_switch_model = false;

	    // ---- Input side (source sample rate) ----
	    // Samples per block as required from the caller.
	    m_input_block_frame = static_cast<int>(gs_block_time * gs_src_samplerate);
	    // Extra history kept for inference.
	    m_input_extra_frame = static_cast<int>(gs_extra_time * gs_src_samplerate);
	    const int src_hop = gs_src_samplerate / 100; // samples in 10 ms
	    const int src_crossfade_frame = static_cast<int>(gs_crossfade_time * gs_src_samplerate);

	    // Length of the inference buffer, rounded up to a multiple of the hop.
	    m_input_predict_buf_frame = static_cast<int>(
	        ceil((m_input_extra_frame + src_crossfade_frame + m_input_block_frame) * 1.0 / src_hop) * src_hop);
	    m_input_predict_buf = new float[m_input_predict_buf_frame];
	    memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);

	    // ---- Output side (destination sample rate) ----
	    m_crossfade_frame = static_cast<int>(gs_crossfade_time * gs_dst_samplerate);
	    m_output_block_frame = static_cast<int>(gs_block_time * gs_dst_samplerate);
	    const int dst_extra_frame = static_cast<int>(gs_extra_time * gs_dst_samplerate);
	    const int dst_hop = gs_dst_samplerate / 100;
	    m_output_cache_buf_frame = static_cast<int>(
	        ceil((m_output_block_frame + m_crossfade_frame + dst_extra_frame) * 1.0 / dst_hop) * dst_hop);
	    m_output_cache_buf = new float[m_output_cache_buf_frame];
	    memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
	    m_crossfade_buf = new float[m_crossfade_frame];
	    memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);

	    // ---- Model input/output caches; the shapes are fixed by the models ----
	    m_hubert_ret.resize(1);
	    m_hubert_ret[0].resize(gs_hubert_frame);
	    for (auto &hubert_row : m_hubert_ret[0]) {
	        hubert_row.resize(gs_hubert_dim);
	    }

	    // Input of the synthesizer model.
	    m_synth_input.resize(1);
	    m_synth_input[0].resize(gs_synth_input_frame);
	    for (auto &synth_row : m_synth_input[0]) {
	        synth_row.resize(gs_synth_input_dim);
	    }

	    // Output of the synthesizer model: [1][1][gs_synth_output_frame].
	    m_synth_out.resize(1);
	    m_synth_out[0].resize(1);
	    m_synth_out[0][0].resize(gs_synth_output_frame);
	}

	/** Releases the working buffers through uninit(). */
	CRvcLiteOnline::~CRvcLiteOnline()
	{
	    uninit();
	}

	/********************************对内函数*******************************************/
	/**
	 * Frees the three heap buffers and returns every member to its default
	 * state via init_variable().
	 */
	void CRvcLiteOnline::uninit() {
	    // delete[] on a null pointer is a no-op, so no explicit null checks are
	    // needed; init_variable() nulls the pointers afterwards.
	    delete[] m_input_predict_buf;
	    m_input_predict_buf = NULL;

	    delete[] m_output_cache_buf;
	    m_output_cache_buf = NULL;

	    delete[] m_crossfade_buf;
	    m_crossfade_buf = NULL;

	    init_variable();
	}

	/**
	 * Extracts the f0 track of the current prediction buffer with pYIN, scales it
	 * by the pitch-shift factor, stores it in m_f0_data and then derives the
	 * coarse representation through get_f0_post().
	 * The two loop headers below are the before (-) and after (+) lines of the diff.
	 */
	void CRvcLiteOnline::get_pyin_f0() {
	// Feed the buffer to the estimator, advancing 160 samples per call.
	// NOTE(review): the new bound stops 1024 + 160 samples before the end, presumably
	// so that every call still has a full analysis window available - confirm.
	- for (int i = 0; i < m_input_predict_buf_frame; i += 160) {
	+ for (int i = 0; i < m_input_predict_buf_frame - 1024 - 160; i += 160) {
	m_es_pyin->process(m_input_predict_buf + i);
	}
	m_f0_data.clear();
	ESFeatureSet feats = m_es_pyin->getRemainingFeatures();
	if (!feats.empty()) {
	// Entry 4 of the feature set is used as the f0 track (meaning of the index: confirm against ESPYIN).
	m_f0_data.resize(feats[4].size());
	for (size_t i = 0; i < feats[4].size(); ++i) {
	// Apply the pitch shift.
	m_f0_data[i] = feats[4][i].values[0] * m_f0_up_key;
	// Negative values are clamped to 0.
	if (m_f0_data[i] < 0) {
	m_f0_data[i] = 0;
	}
	}
	}
	// Clear the estimator state before the next block, then build m_f0_coarse_data.
	m_es_pyin->reset();
	get_f0_post();
	}

	/**
	 * Builds m_f0_coarse_data from m_f0_data: every f0 value is mapped onto a
	 * log-frequency scale, normalised so that 50 Hz..1100 Hz spans 1..255,
	 * clamped to [1, 255] and rounded to the nearest integer.
	 */
	void CRvcLiteOnline::get_f0_post() {
	    const int lowest_hz = 50;
	    const int highest_hz = 1100;
	    const float mel_low = 1127 * log2(1 + lowest_hz * 1.0 / 700);
	    const float mel_high = 1127 * log2(1 + highest_hz * 1.0 / 700);

	    m_f0_coarse_data.clear();
	    m_f0_coarse_data.resize(m_f0_data.size());

	    for (size_t idx = 0; idx < m_f0_data.size(); ++idx) {
	        float mel = 1127 * log2(1 + m_f0_data[idx] / 700);
	        // Only positive values are rescaled; an f0 of 0 stays at 0 here.
	        if (mel > 0) {
	            mel = (mel - mel_low) * 254.f / (mel_high - mel_low) + 1;
	        }
	        // Clamp to [1, 255].
	        mel = (mel <= 1) ? 1 : ((mel > 255) ? 255 : mel);
	        // Round to nearest by adding 0.5 and truncating.
	        m_f0_coarse_data[idx] = float(int(mel + 0.5));
	    }
	}

	/**
	 * Puts every member into its default state: flags cleared, lengths zero,
	 * buffer pointers null, caches empty, pitch factors 1.
	 * The buffer pointers are overwritten without being freed, so callers must
	 * release the buffers first (uninit() does).
	 */
	void CRvcLiteOnline::init_variable() {
	    // State flags.
	    m_init = false;
	    m_switch_model = false;
	    m_fade_in = true;

	    // Pitch-shift factors (current and pending).
	    m_f0_up_key = 1.f;
	    m_f0_new_up_key = 1.f;

	    // Input-side buffer and its lengths (in samples).
	    m_input_block_frame = 0;
	    m_input_extra_frame = 0;
	    m_input_predict_buf_frame = 0;
	    m_input_predict_buf = nullptr;

	    // Output-side buffers and their lengths.
	    m_crossfade_frame = 0;
	    m_output_block_frame = 0;
	    m_output_cache_buf_frame = 0;
	    m_crossfade_buf = nullptr;
	    m_output_cache_buf = nullptr;

	    // f0 tracks.
	    m_f0_data.clear();
	    m_f0_coarse_data.clear();

	    // Cached model inputs and outputs.
	    m_hubert_ret.clear();
	    m_synth_input.clear();
	    m_synth_out.clear();
	}

	/********************************对外函数*******************************************/
	/**
	 * Creates the HuBERT instance, a placeholder synthesizer and the pYIN pitch
	 * estimator, then marks the object initialised.
	 * @param hubert_model_path path of the HuBERT model file
	 * @return ERR_RVC_LITE_REINIT when already initialised, otherwise ERR_RVC_LITE_SUCCESS
	 */
	int CRvcLiteOnline::init(const char *hubert_model_path) {
	    if (m_init)
	    {
	        return ERR_RVC_LITE_REINIT;
	    }

	    m_hubert_inst = std::make_shared<Hubert>();
	    m_synthesizer_inst = std::make_shared<CSynthesizer>();
	    // NOTE(review): the status returned by Hubert::init is discarded, so a model
	    // that failed to load is not reported to the caller.
	    m_hubert_inst->init(hubert_model_path);
	    // m_synthesizer_inst->init(synth_model_path);

	    // Original note: stepSize is required to be 2^n.
	    // NOTE(review): the step passed here is 160, which is not a power of two - confirm.
	    m_es_pyin = std::make_shared<ESPYIN>(16000, 160, 1024, 50, 1100);

	    // No voice model selected yet; the first block is faded in; no pitch shift.
	    m_switch_model = false;
	    m_fade_in = true;
	    m_f0_up_key = 1.f;
	    m_f0_new_up_key = 1.f;
	    m_init = true;
	    return ERR_RVC_LITE_SUCCESS;
	}

	/**
	 * Loads a new synthesizer (voice) model and enables processing.
	 *
	 * Fix: a null path is rejected up front; previously it was converted to
	 * std::string inside file_exists(), which is undefined behaviour.
	 *
	 * @param synth_model_path path of the synthesizer model file
	 * @return ERR_RVC_LITE_NOT_INIT before init(), ERR_RVC_LITE_MODEL_NOT_EXISTS
	 *         when the path is null or the file is missing (the previous model, if
	 *         any, stays active), otherwise ERR_RVC_LITE_SUCCESS
	 */
	int CRvcLiteOnline::switch_synth_model(const char *synth_model_path) {
	    if (!m_init) {
	        return ERR_RVC_LITE_NOT_INIT;
	    }

	    if (synth_model_path == nullptr || !file_exists(synth_model_path)) {
	        return ERR_RVC_LITE_MODEL_NOT_EXISTS;
	    }

	    m_synthesizer_inst = std::make_shared<CSynthesizer>();
	    // NOTE(review): the status returned by CSynthesizer::init is discarded.
	    m_synthesizer_inst->init(synth_model_path);
	    m_switch_model = true;
	    return ERR_RVC_LITE_SUCCESS;
	}

	/**
	 * Sets the pending pitch shift in semitones.
	 * The value is clamped to [-12, 12] and stored as the frequency ratio
	 * 2^(key/12); process_block() picks it up on its next call.
	 * @param key pitch shift in semitones
	 */
	void CRvcLiteOnline::set_up_key(int key)
	{
	    const int clamped_key = (key > 12) ? 12 : ((key < -12) ? -12 : key);
	    m_f0_new_up_key = pow(2, clamped_key / 12.f);
	}


	/**
	 * Clears the streaming history: zeroes the inference, output-cache and
	 * cross-fade buffers and requests a fade-in for the next input block.
	 */
	void CRvcLiteOnline::reset() {
	    const size_t predict_bytes = sizeof(float) * m_input_predict_buf_frame;
	    const size_t crossfade_bytes = sizeof(float) * m_crossfade_frame;
	    const size_t cache_bytes = sizeof(float) * m_output_cache_buf_frame;

	    memset(m_input_predict_buf, 0, predict_bytes);
	    memset(m_crossfade_buf, 0, crossfade_bytes);
	    memset(m_output_cache_buf, 0, cache_bytes);
	    m_fade_in = true;
	}

	/**
	 * Converts one block of audio.
	 *
	 * The block is appended to the sliding inference buffer, f0 and HuBERT
	 * features are computed over the whole buffer, the synthesizer produces a new
	 * output window, and the last out_len samples before the cross-fade tail are
	 * returned, cross-faded with the tail kept from the previous call.
	 *
	 * Fixes:
	 *  - in_buf / out_buf are pointers again (the `*` had been lost in the listing);
	 *  - the two sliding-window shifts use memmove, because source and destination
	 *    overlap and memcpy is undefined for overlapping ranges;
	 *  - frames for which no f0 value exists are filled with the unvoiced values
	 *    (f0 = 0, coarse = 1, which is what get_f0_post() yields for f0 = 0) instead
	 *    of reading past the end of the f0 vectors;
	 *  - the cross-fade never touches more than out_len samples of out_buf.
	 *
	 * NOTE(review): in_len and out_len are not validated against the internal
	 * buffer lengths; callers must keep them within range.
	 *
	 * @param in_buf  input samples; modified in place when a fade-in is pending
	 * @param in_len  number of input samples
	 * @param out_buf receives out_len samples
	 * @param out_len number of output samples requested
	 * @return ERR_RVC_LITE_NOT_INIT, ERR_RVC_LITE_NOT_SWITCH_MODEL, or 0 on success
	 */
	int CRvcLiteOnline::process_block(float *in_buf, int in_len, float *out_buf, int out_len) {
	    if (!m_init) {
	        return ERR_RVC_LITE_NOT_INIT;
	    }

	    if (!m_switch_model)
	    {
	        return ERR_RVC_LITE_NOT_SWITCH_MODEL;
	    }

	    // The incoming audio is discontinuous (e.g. after reset()): apply a linear fade-in.
	    if (m_fade_in)
	    {
	        for (int i = 0; i < in_len; i++)
	        {
	            float rate = i * 1.0 / in_len;
	            in_buf[i] = in_buf[i] * rate;
	        }
	        m_fade_in = false;
	    }

	    // Drop the oldest in_len samples (overlapping move) ...
	    memmove(m_input_predict_buf, m_input_predict_buf + in_len,
	            sizeof(float) * (m_input_predict_buf_frame - in_len));
	    // ... and append the new block at the end.
	    memcpy(m_input_predict_buf + (m_input_predict_buf_frame - in_len), in_buf,
	           sizeof(float) * in_len);

	    // Extract the f0 feature sequence.
	    struct timeval start;
	    struct timeval end;
	    gettimeofday(&start, NULL);
	    m_f0_up_key = m_f0_new_up_key;
	    get_pyin_f0();
	    gettimeofday(&end, NULL);
	    LOGE("CRvcLiteOnline", "get pyin sp = %f ms\n",
	         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);

	    // HuBERT inference.
	    gettimeofday(&start, NULL);
	    m_hubert_inst->process(m_input_predict_buf, m_hubert_ret);
	    gettimeofday(&end, NULL);
	    LOGE("CRvcLiteOnline", "m_hubert_inst sp = %f ms\n",
	         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);

	    // Assemble the synthesizer input: HuBERT features followed by coarse f0 (index 256) and f0 (index 257).
	    for (int i = 0; i < gs_synth_input_frame; i++) {
	        for (int j = 0; j < gs_hubert_dim; j++) {
	            m_synth_input[0][i][j] = m_hubert_ret[0][i][j];
	        }
	        const size_t frame_idx = static_cast<size_t>(i);
	        m_synth_input[0][i][256] = frame_idx < m_f0_coarse_data.size() ? m_f0_coarse_data[i] : 1.f;
	        m_synth_input[0][i][257] = frame_idx < m_f0_data.size() ? m_f0_data[i] : 0.f;
	    }
	    gettimeofday(&start, NULL);
	    m_synthesizer_inst->process(m_synth_input, m_synth_out);
	    gettimeofday(&end, NULL);
	    LOGE("CRvcLiteOnline", "m_synthesizer_inst sp = %f ms\n",
	         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);

	    // Shift the output cache (overlapping move) and append the new synthesizer output.
	    memmove(m_output_cache_buf, m_output_cache_buf + gs_synth_output_frame,
	            sizeof(float) * (m_output_cache_buf_frame - gs_synth_output_frame));
	    memcpy(m_output_cache_buf + (m_output_cache_buf_frame - gs_synth_output_frame),
	           m_synth_out[0][0].data(), sizeof(float) * gs_synth_output_frame);

	    // Hand out the out_len samples that end where the cross-fade tail begins.
	    int start_pos = m_output_cache_buf_frame - m_crossfade_frame - out_len;
	    memcpy(out_buf, m_output_cache_buf + start_pos, sizeof(float) * out_len);

	    // Cross-fade the head of this block with the tail saved from the previous one.
	    const int fade_len = out_len < m_crossfade_frame ? out_len : m_crossfade_frame;
	    for (int i = 0; i < fade_len; i++) {
	        float rate = float(i * 1.f / m_crossfade_frame);
	        out_buf[i] = rate * out_buf[i] + m_crossfade_buf[i] * (1 - rate);
	    }
	    // Keep the current tail for the next call.
	    memcpy(m_crossfade_buf, m_output_cache_buf + (m_output_cache_buf_frame - m_crossfade_frame),
	           sizeof(float) * m_crossfade_frame);

	    return 0;
	}

	/**
	 * Algorithmic latency in milliseconds: the cross-fade time plus 30 ms.
	 * Original note: besides the block delay, HuBERT should in theory deliver 208
	 * frames but actually delivers 205, which accounts for the extra 30 ms.
	 */
	int CRvcLiteOnline::get_latency_ms() {
	    const int latency_ms = gs_crossfade_time * 1000 + 30;
	    return latency_ms;
	}



	/*****************************对内的类************************************/
	/**
	 * Creates a resampler with no backend attached.
	 *
	 * Fix: the channel counts now start at 0. resample() reads them whenever
	 * no backend is attached, so they must not be left indeterminate before init().
	 */
	CResample::CResample()
	{
	    m_resample_inst = nullptr;
	    m_in_channel = 0;
	    m_out_channel = 0;
	}

	/** Nothing to release by hand: the backend is held by a shared_ptr. */
	CResample::~CResample() = default;

	/**
	 * Configures the conversion.
	 * When only the channel count differs (same sample rate) no backend is
	 * created and resample() uses the built-in mono/stereo helpers; in every other
	 * case a CFfmpegResampler is created and initialised.
	 * @return ERR_RVC_LITE_SUCCESS on the built-in path, otherwise the backend's init() result
	 */
	int CResample::init(int in_samplerate, int out_samplerate, int in_channel, int out_channel)
	{
	    m_in_channel = in_channel;
	    m_out_channel = out_channel;

	    const bool channel_change_only = (in_samplerate == out_samplerate) && (in_channel != out_channel);
	    if (channel_change_only) {
	        m_resample_inst = nullptr;
	        return ERR_RVC_LITE_SUCCESS;
	    }

	    m_resample_inst = std::make_shared<CFfmpegResampler>();
	    return m_resample_inst->init(in_samplerate, out_samplerate, in_channel, out_channel);
	}

	/**
	 * Output size that corresponds to `num` input samples.
	 * @return the backend's estimate, or `num` itself when no backend is attached
	 */
	int CResample::get_out_samples(int num)
	{
	    if (!m_resample_inst)
	    {
	        return num;
	    }
	    return m_resample_inst->get_out_samples(num);
	}

	/** Resets the backend, if one is attached; otherwise does nothing. */
	void CResample::reset()
	{
	    if (!m_resample_inst)
	    {
	        return;
	    }
	    m_resample_inst->reset();
	}

	/**
	 * Latency reported by the backend.
	 * @return the backend's value, or 0 when no backend is attached
	 */
	int CResample::get_latency()
	{
	    if (!m_resample_inst)
	    {
	        return 0;
	    }
	    return m_resample_inst->get_latency();
	}

	/**
	 * Converts in_num frames from in_buf into out_buf.
	 *
	 * With a backend attached the call is forwarded. Without one only the
	 * channel count changes (2 -> 1 or 1 -> 2); any other combination leaves
	 * out_buf untouched and reports success.
	 *
	 * Fixes:
	 *  - in_buf / out_buf are pointers again (the `*` had been lost in the listing);
	 *  - stereo2mono() takes the interleaved buffer length, not the frame count,
	 *    so it is now given in_num * 2; before, only the first half of the frames
	 *    was down-mixed. The callers in this file pass per-channel counts
	 *    (e.g. resample_in_len / m_channel).
	 *
	 * @param in_buf  input samples (interleaved when stereo)
	 * @param in_num  number of input frames (samples per channel)
	 * @param out_buf output buffer
	 * @param out_num capacity of out_buf in frames; forwarded by reference to the backend
	 * @return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT when out_num < in_num on the
	 *         built-in path, the backend's result when one is attached, otherwise
	 *         ERR_RVC_LITE_SUCCESS
	 */
	int CResample::resample(float *in_buf, int in_num, float *out_buf, int &out_num) {
	    if (m_resample_inst) {
	        return m_resample_inst->resample(in_buf, in_num, out_buf, out_num);
	    }

	    if (m_in_channel == 2 && m_out_channel == 1) {
	        if (out_num < in_num) {
	            return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
	        }
	        // in_num stereo frames occupy in_num * 2 floats.
	        stereo2mono(in_buf, in_num * 2, out_buf);
	        return ERR_RVC_LITE_SUCCESS;
	    }

	    if (m_in_channel == 1 && m_out_channel == 2) {
	        if (out_num < in_num) {
	            return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
	        }
	        mono2stereo(in_buf, in_num, out_buf);
	        return ERR_RVC_LITE_SUCCESS;
	    }
	    return ERR_RVC_LITE_SUCCESS;
	}

	/*****************************对外的类*************************************/




	/*****************************对内函数*************************************/
	/**
	 * Default state: not initialised, worker stopped, 44100 Hz mono,
	 * no voice model selected or pending.
	 */
	void CRvcLiteOnlineRealTime::init_variable() {
	    // Lifecycle flags.
	    m_init = false;
	    m_rvc_stop = true;

	    // Audio format defaults.
	    m_sample_rate = 44100;
	    m_channel = 1;

	    // Voice-model selection.
	    m_synth_path = "";
	    m_new_synth_path = "";
	    m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
	}

	/*****************************对外函数*************************************/
	/** Starts from the defaults set by init_variable(); init() does the real setup. */
	CRvcLiteOnlineRealTime::CRvcLiteOnlineRealTime()
	{
	    init_variable();
	}

	/** Stops the worker through uninit() before the members are destroyed. */
	CRvcLiteOnlineRealTime::~CRvcLiteOnlineRealTime()
	{
	    uninit();
	}

	int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, int sample_rate, int channel) {
	if (m_init) {
	return ERR_RVC_LITE_RT_REINIT;
	}

	if (sample_rate < 16000) {
	return ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR;
	}
	init_variable();
	m_sample_rate = sample_rate;
	m_channel = channel;
	m_synth_path = "";
	m_new_synth_path = "";
	m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
	int output_one_sec_number = m_sample_rate * m_channel; // 临时使用的数据
	int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
	CThreadPool::Task task = std::bind(&CRvcLiteOnlineRealTime::rvc_process, this);

	m_rvc_inst = std::make_shared<CRvcLiteOnline>();
	int err = m_rvc_inst->init(hubert_model_path);
	if (ERR_RVC_LITE_SUCCESS != err) {
	goto exit;
	}

	// 重采样部分
	m_resample_queue = std::make_shared<CRvcCircleBuffer>(sample_rate * 3 * m_channel);
	m_resample16 = std::make_shared<CResample>();
	err = m_resample16->init(m_sample_rate, gs_src_samplerate, m_channel, 1);
	if (ERR_RVC_LITE_SUCCESS != err) {
	goto exit;
	}

	m_resample2src = std::make_shared<CResample>();
	err = m_resample2src->init(gs_dst_samplerate, m_sample_rate, 1, m_channel);
	if (ERR_RVC_LITE_SUCCESS != err) {
	goto exit;
	}
	m_resample_buf_max_len = 2048; // 此时空间最大是2048，保证不超即可
	m_resample_in_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
	m_resample_out_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());

	// 核心处理部分
	m_input_tmp_buf_len = gs_src_samplerate;
	m_output_tmp_buf_len = gs_dst_samplerate;
	m_input_tmp_buf = std::shared_ptr<float>(new float[m_input_tmp_buf_len], std::default_delete<float[]>());
	m_output_tmp_buf = std::shared_ptr<float>(new float[m_output_tmp_buf_len], std::default_delete<float[]>());
	memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
	memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);

	// 循环buffer
	m_input_queue = std::make_shared<CRvcCircleBuffer>(m_input_tmp_buf_len * 3);
	// 对外的是目标的采样率和通道数的数据
	m_out_queue = std::make_shared<CRvcCircleBuffer>(output_one_sec_number * 3);
	m_latency_queue = std::make_shared<CRvcCircleBuffer>(latency_len);
	// 提前塞入两组，保证延迟稳定在2s
	for (int i = 0; i < 2; i++) {
	// 塞入1s数据
	for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
	m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
	}
	m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
	}
	// 算法本身有延迟，所有为了保证延迟一致，在无效果的时候需要添加该延迟
	for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
	m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
	}
	m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);

	// 开始处理线程
	m_thread_pool = std::make_shared<CThreadPool>();
	m_thread_pool->start(1);
	m_rvc_stop = false;
	m_thread_pool->run(task);

	m_init = true;
	exit:
	if (ERR_RVC_LITE_SUCCESS != err) {
	m_init = true;
	uninit();
	}
	return err;
	}

	/**
	 * Requests a voice-model change. Only the pending path is recorded here,
	 * under the mutex; the worker thread compares it with the active path and
	 * performs the actual switch.
	 * @param synth_model_path path of the new synthesizer model
	 * @return ERR_RVC_LITE_RT_NOT_INIT before init(), otherwise ERR_RVC_LITE_SUCCESS
	 */
	int CRvcLiteOnlineRealTime::switch_synth(const char *synth_model_path) {
	    if (!m_init)
	    {
	        return ERR_RVC_LITE_RT_NOT_INIT;
	    }

	    std::unique_lock<std::mutex> guard(m_rvc_mutex);
	    m_new_synth_path = synth_model_path;
	    guard.unlock();

	    return ERR_RVC_LITE_SUCCESS;
	}


	/**
	 * Real-time entry point: queues the input for the worker thread and returns
	 * whatever processed audio is available, up to out_len samples.
	 *
	 * Fix: in_buf / out_buf are pointers again (the `*` had been lost in the listing).
	 *
	 * @param in_buf  input samples at the caller's rate / channel count
	 * @param in_len  number of input samples
	 * @param out_buf receives the output; zero-filled first, so a short read leaves silence
	 * @param out_len number of output samples requested
	 * @return ERR_RVC_LITE_RT_NOT_INIT, ERR_RVC_LITE_RT_NOT_ENOUGH_DATA when fewer
	 *         than out_len samples were available, otherwise ERR_RVC_LITE_SUCCESS
	 */
	int CRvcLiteOnlineRealTime::process(float *in_buf, int in_len, float *out_buf, int out_len) {
	    if (!m_init) {
	        return ERR_RVC_LITE_RT_NOT_INIT;
	    }

	    // Hand the input to the worker and wake it up.
	    {
	        std::unique_lock<std::mutex> lock(m_rvc_mutex);
	        m_resample_queue->push(in_buf, in_len);
	        m_rvc_cond.notify_all();
	    }
	    memset(out_buf, 0, sizeof(float) * out_len);
	    int tmp_out_len = out_len;

	    // Fetch output; tmp_out_len is compared with the request afterwards.
	    {
	        std::unique_lock<std::mutex> lock(m_rvc_mutex);
	        m_out_queue->pop(out_buf, tmp_out_len);
	    }

	    if (tmp_out_len != out_len) {
	        return ERR_RVC_LITE_RT_NOT_ENOUGH_DATA;
	    }
	    return ERR_RVC_LITE_SUCCESS;
	}

	/**
	 * Discards all buffered audio and restores the state reached right after
	 * init(): queues emptied, resamplers and converter reset, the output queue
	 * pre-filled with two seconds of silence and the latency queue with its
	 * delay. Everything happens under m_rvc_mutex.
	 *
	 * NOTE(review): the worker thread uses m_output_tmp_buf and m_rvc_inst in
	 * rvc_process_step() without holding this mutex - confirm that reset() cannot
	 * race with a block in flight.
	 */
	void CRvcLiteOnlineRealTime::reset() {
	    if (!m_init) {
	        return;
	    }

	    std::unique_lock<std::mutex> guard(m_rvc_mutex);

	    m_resample_queue->reset();
	    m_resample16->reset();
	    m_resample2src->reset();
	    m_input_queue->reset();
	    m_out_queue->reset();
	    m_rvc_inst->reset();
	    m_latency_queue->reset();

	    // Silence source for the pre-fill below.
	    float *silence = m_output_tmp_buf.get();
	    memset(silence, 0, sizeof(float) * m_output_tmp_buf_len);

	    // Pre-fill two seconds so the latency stays fixed at 2 s.
	    const int one_second_len = m_sample_rate * m_channel;
	    const int second_full_chunks = one_second_len / m_output_tmp_buf_len;
	    const int second_remainder = one_second_len % m_output_tmp_buf_len;
	    for (int pass = 0; pass < 2; pass++) {
	        for (int chunk = 0; chunk < second_full_chunks; chunk++) {
	            m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
	        }
	        m_out_queue->push(m_output_tmp_buf.get(), second_remainder);
	    }

	    // The algorithm itself adds latency; the bypass path gets the same delay so both stay aligned.
	    int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
	    const int latency_full_chunks = latency_len / m_output_tmp_buf_len;
	    for (int chunk = 0; chunk < latency_full_chunks; chunk++) {
	        m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
	    }
	    m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
	}

	void CRvcLiteOnlineRealTime::flush(float *&out_buf, int &len) {
	// 将内部的所有的数据吐出来
	/**
	* 先停止
	*/
	stop();

	// 无音色转换的情况
	int resample_in_len = 0;
	int resample_out_len = 0;
	if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
	{
	while (m_resample_queue->size() > 0) {
	resample_in_len = m_resample_buf_max_len;
	m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
	m_latency_queue->push(m_resample_in_buf.get(), resample_in_len);
	m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
	m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
	}

	while(m_latency_queue->size() > 0)
	{
	resample_in_len = m_resample_buf_max_len;
	m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
	m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
	}

	len = m_out_queue->size();
	out_buf = new float[len];
	m_out_queue->pop(out_buf, len);
	return;
	}

	// 有音色转换的情况
	while (m_resample_queue->size() > 0) {
	resample_in_len = m_resample_buf_max_len;
	m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
	// 输入的数据需要考虑channel
	resample_out_len = m_resample16->get_out_samples(resample_in_len / m_channel);
	m_resample16->resample(m_resample_in_buf.get(), resample_in_len / m_channel, m_resample_out_buf.get(),
	resample_out_len);
	// 输出是16k单声道，不需要考虑
	m_input_queue->push(m_resample_out_buf.get(), resample_out_len);
	}
	memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
	int add_size = m_input_tmp_buf_len - m_input_queue->size() % m_input_tmp_buf_len;
	if (add_size != 0 && add_size < m_input_tmp_buf_len) {
	m_input_queue->push(m_input_tmp_buf.get(), add_size);
	}
	int num = m_input_queue->size() / m_input_tmp_buf_len;
	for (int i = 0; i < num; i++) {
	rvc_process_step();
	}

	// 将所有数据拷贝出来
	len = m_out_queue->size();
	out_buf = new float[len];
	m_out_queue->pop(out_buf, len);
	}

	/**
	 * Total latency in milliseconds: the converter's own latency plus the
	 * 2000 ms that are pre-filled into the output queue.
	 * NOTE(review): m_rvc_inst is dereferenced without an m_init check.
	 */
	int CRvcLiteOnlineRealTime::get_latency_ms() {
	    const int converter_latency_ms = m_rvc_inst->get_latency_ms();
	    return converter_latency_ms + 2000;
	}

	/*****************************对内函数*************************************/
	/**
	 * Stops the worker thread if the instance is initialised; otherwise does nothing.
	 * m_init itself is not changed here.
	 */
	void CRvcLiteOnlineRealTime::uninit() {
	    if (m_init)
	    {
	        stop();
	    }
	}

	void CRvcLiteOnlineRealTime::stop() {
	// 释放thread_pool的数据,先通知一下rvc_process,防止是在等待中
	m_rvc_stop = true;
	if (m_thread_pool) {
	m_rvc_cond.notify_all();
	m_thread_pool->stop();
	}
	}

	/**
	 * Processes one block: pops m_input_tmp_buf_len samples from the input
	 * queue, runs the converter, resamples the result back to the caller's format
	 * in chunks and pushes it to the output queue.
	 *
	 * Fix: the fade-out ramp applied when switching from "effect" to "no effect"
	 * is normalised by the length of the chunk it is applied to
	 * (sample_out_len * m_channel). It used to be divided by `step`, the input
	 * chunk length, so the factor (1 - rate) dropped below zero whenever the
	 * resampled chunk was longer than `step`.
	 */
	void CRvcLiteOnlineRealTime::rvc_process_step() {

	    struct timeval start;
	    struct timeval end;
	    int sample_out_len = 0;
	    // Nothing to do until a full block is available.
	    if (m_input_queue->size() < m_input_tmp_buf_len) {
	        return;
	    }
	    gettimeofday(&start, NULL);
	    m_input_queue->pop(m_input_tmp_buf.get(), m_input_tmp_buf_len);
	    m_rvc_inst->process_block(m_input_tmp_buf.get(), m_input_tmp_buf_len,
	                              m_output_tmp_buf.get(), m_output_tmp_buf_len);
	    gettimeofday(&end, NULL);
	    LOGD("RvcLite", "rvc_process process sp %f ms",
	         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);

	    // Resample back to the caller's format.
	    // The sample rate grows by less than a factor of two, but the channel count
	    // may double, so each chunk is a quarter of the scratch buffer.
	    gettimeofday(&start, NULL);
	    bool last = false;
	    int step = m_resample_buf_max_len / 4;
	    for (int i = 0; i < m_output_tmp_buf_len; i += step) {
	        if (i + step >= m_output_tmp_buf_len) {
	            step = m_output_tmp_buf_len - i;
	            last = true;
	        }
	        // The input is mono here, so the frame count equals the buffer length.
	        sample_out_len = m_resample2src->get_out_samples(step);
	        m_resample2src->resample(m_output_tmp_buf.get() + i, step, m_resample_out_buf.get(), sample_out_len);

	        // Transition from "effect" to "no effect".
	        if (last && m_syn_state == RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT)
	        {
	            // The bypass path is delay-aligned as well, so a fade-out of the final chunk is sufficient.
	            const int fade_len = sample_out_len * m_channel;
	            for (int ii = 0; ii < fade_len; ii += m_channel)
	            {
	                float rate = ii * 1.0 / fade_len;
	                for (int jj = 0; jj < m_channel; jj++)
	                {
	                    m_resample_out_buf.get()[ii+jj] = m_resample_out_buf.get()[ii+jj] * (1 - rate);
	                }
	            }
	            m_syn_state = RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT;
	        }

	        {
	            std::unique_lock<std::mutex> lock(m_rvc_mutex);
	            m_out_queue->push(m_resample_out_buf.get(), sample_out_len * m_channel);
	        }
	    }
	    gettimeofday(&end, NULL);
	    LOGD("RvcLite", "rvc_process re_resample sp %f ms",
	         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
	    printf("finish ...\n");
	}

	void CRvcLiteOnlineRealTime::rvc_process() {
	int sample_in_len;
	int sample_out_len = 0;
	while (!m_rvc_stop) {
	{
	// 重采样
	std::unique_lock<std::mutex> lock(m_rvc_mutex);
	if (m_resample_queue->size() < m_resample_buf_max_len) {
	// 睡眠前检查下情况
	if (m_rvc_stop) {
	return;
	}
	m_rvc_cond.wait(lock);
	continue;
	}
	sample_in_len = m_resample_buf_max_len;
	m_resample_queue->pop(m_resample_in_buf.get(), sample_in_len);
	}

	/**
	* 此处有三种情况:
	* 因为无论哪种变换，有延迟的存在，导致输入的数据都是需要塞0进去，所以对当前的数据做fade_out即可
	* 1. 无到有:对无到有的部分做个fade_out,对下一帧要塞入音效器的部分做fade_in
	* 2. 有到无:对无到有的部分做个fade_out,对下一帧要塞入音效器的部分做fade_in
	* 3. 有到有[这个不用考虑，内部自己做了处理]
	*/
	if (m_synth_path != m_new_synth_path) {

	// 从无到有，此时对本帧做fade_out,对下一帧输入做fade_in
	if(m_synth_path.empty() && !m_new_synth_path.empty())
	{
	m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT;
	}

	// 从有到无
	if (!m_synth_path.empty() && m_new_synth_path.empty())
	{
	m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT;
	}

	{
	std::unique_lock<std::mutex> lock(m_rvc_mutex);
	m_synth_path = m_new_synth_path;
	}
	m_rvc_inst->switch_synth_model(m_new_synth_path.c_str());
	}

	// 刚切过来第一次做效果
	if(m_syn_state == RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT)
	{
	// 刚从有到无，需要清空数据,以及对输入的队列添加fade_in
	m_latency_queue->reset();
	// 算法本身有延迟，所有为了保证延迟一致，在无效果的时候需要添加该延迟
	memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
	int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
	for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
	m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
	}
	m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);

	// 对输入做fade_in
	for(int i = 0; i < sample_in_len; i+=m_channel)
	{
	float rate = i * 1.0 / sample_in_len;
	for(int j = 0; j < m_channel; j++)
	{
	m_resample_in_buf.get()[i+j] *= rate;
	}
	}
	m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
	}

	// 不做效果
	if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
	{
	m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
	m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
	{
	std::unique_lock<std::mutex> lock(m_rvc_mutex);
	m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
	}
	continue;
	}

	// 从无到有的转换
	if (m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT)
	{
	// 做fade_out
	for(int i = 0; i < sample_in_len; i+=m_channel)
	{
	float rate = i * 1.0 / sample_in_len;
	for(int j = 0; j < m_channel; j++)
	{
	m_resample_in_buf.get()[i+j] *= 1 - rate;
	}
	}
	m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
	m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
	{
	std::unique_lock<std::mutex> lock(m_rvc_mutex);
	m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
	}

	// 此时对于rvc来说输入的数据不连贯了，所以清空内部数据重新搞
	m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT;
	m_rvc_inst->reset();
	continue;
	}

	// 重采样到16k,此处采样率变低，所以不会出现sample_out_len > sample_in_len的情况
	sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel);
	m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(),
	sample_out_len);
	m_input_queue->push(m_resample_out_buf.get(), sample_out_len);
	rvc_process_step();
	}
	}
	\ No newline at end of file
	diff --git a/mnn_demo/src/CRvcLiteOnlineV2.cpp b/mnn_demo/src/CRvcLiteOnlineV2.cpp
	index 0269a7d..97a67fb 100644
	--- a/mnn_demo/src/CRvcLiteOnlineV2.cpp
	+++ b/mnn_demo/src/CRvcLiteOnlineV2.cpp
	@@ -1,215 +1,215 @@
	//
	// Created by Administrator on 2024/1/22.
	//

	#include "CRvcLiteOnlineV2.h"
	#include "CRvcCircleBuffer.h"
	#include <unistd.h>

	// Returns true when access(2) reports that the path exists (F_OK).
	inline bool file_exists1 (const std::string& name) {
	    const int rc = access(name.c_str(), F_OK);
	    return rc != -1;
	}

	// Construction does no work; all state is set up in init().
	CRvcLiteOnlineV2::CRvcLiteOnlineV2() = default;

	// Nothing to release explicitly; members clean up after themselves.
	CRvcLiteOnlineV2::~CRvcLiteOnlineV2() = default;
	/***************************************对内函数*************************************************************/
	/**
	 * Applies a pending model change (m_new_syn_model) and updates m_sync_state.
	 *
	 * Three transitions exist:
	 *  - no effect -> effect: the dry signal fades out and the effect fades in
	 *  - effect -> no effect: the effect fades out and the dry signal fades in
	 *  - effect -> effect: no cross-fade is set up here, but the new model
	 *    still has to be handed to the synthesizer
	 *
	 * @param reset true when a reset was just performed; the cross-fade state
	 *              is then skipped and the target state is entered directly
	 */
	void CRvcLiteOnlineV2::set_cur_state(bool reset)
	{
	    if (m_syn_model == m_new_syn_model)
	    {
	        return;
	    }

	    const bool had_model = !m_syn_model.empty();
	    const bool has_model = !m_new_syn_model.empty();

	    if (!had_model && has_model)
	    {
	        // no effect -> effect
	        m_sync_state = reset ? CRVC_V2_STATE_EFFECT : CRVC_V2_STATE_DEFAULT2EFFECT;
	        m_syn_model = m_new_syn_model;
	        m_rvc_inst->switch_model(m_syn_model.c_str());
	    }
	    else if (had_model && !has_model)
	    {
	        // effect -> no effect
	        m_sync_state = reset ? CRVC_V2_STATE_DEFAULT : CRVC_V2_STATE_EFFECT2DEFAULT;
	        m_syn_model = m_new_syn_model;
	    }
	    else
	    {
	        // effect -> effect: previously this case fell through untouched, so
	        // the requested model was never loaded. The state is left as is.
	        m_syn_model = m_new_syn_model;
	        m_rvc_inst->switch_model(m_syn_model.c_str());
	    }
	}

	/***************************************对外函数*************************************************************/
	/**
	 * Creates the synthesizer, sizes the work buffers and queues, and resets
	 * the state to "no effect".
	 * NOTE(review): this span is a diff hunk; the '-' line is the removed
	 * block-length formula and the '+' line is its replacement.
	 * @param hubert_model path forwarded to CRvcLiteSynthesizer::init
	 * @param sample_rate  sample rate of the audio pushed in
	 * @param channel      channel count of the audio pushed in
	 * @return the value returned by CRvcLiteSynthesizer::init
	 */
	int CRvcLiteOnlineV2::init(const char *hubert_model, int sample_rate, int channel)
	{
	m_rvc_inst = std::make_shared<CRvcLiteSynthesizer>();
	- m_block_len = sample_rate * channel - 100 * channel;
	+ m_block_len = int(sample_rate * 0.9) * channel; // process one block every 900 ms
	m_tmp_buf_len = m_block_len * 2;
	m_reset = true;
	m_syn_model = "";
	m_new_syn_model = "";
	m_sync_state = CRVC_V2_STATE_DEFAULT;
	m_fade_len = int(sample_rate * 0.05) * channel; // 50 ms of audio is used for the fade
	m_channel = channel;

	// Scratch buffers hold two blocks; each queue is sized to twice that.
	m_tmp_in_buf = std::shared_ptr<float>(new float[m_tmp_buf_len], std::default_delete<float[]>());
	m_tmp_out_buf = std::shared_ptr<float>(new float[m_tmp_buf_len], std::default_delete<float[]>());
	m_in_queue = std::make_shared<CRvcCircleBuffer>(m_tmp_buf_len * 2);
	m_out_queue = std::make_shared<CRvcCircleBuffer>(m_tmp_buf_len * 2);
	m_input_latency_output_frame = 0;
	return m_rvc_inst->init(hubert_model, sample_rate, channel);
	}

	/**
	 * Requests a model change; it is applied on the next push().
	 * @param synth_model path of the voice model; an empty string (or null)
	 *                    requests "no effect"
	 * @return ERR_RVC_LITE_MODEL_NOT_EXISTS when a non-empty path does not
	 *         exist, ERR_RVC_LITE_SUCCESS otherwise
	 */
	int CRvcLiteOnlineV2::switch_model(const char *synth_model)
	{
	    // Compare the contents, not the pointer: `synth_model != ""` compares
	    // addresses and does not detect an empty string.
	    const bool has_model = (synth_model != nullptr) && (synth_model[0] != '\0');
	    if (has_model && !file_exists1(synth_model))
	    {
	        return ERR_RVC_LITE_MODEL_NOT_EXISTS;
	    }

	    m_new_syn_model = has_model ? synth_model : "";
	    return ERR_RVC_LITE_SUCCESS;
	}

	void CRvcLiteOnlineV2::set_up_key(int key)
	{
	// 内部是线程安全的，所以直接设置即可
	m_rvc_inst->set_up_key(key);
	}

	void CRvcLiteOnlineV2::reset()
	{
	m_reset = true;
	}


	int CRvcLiteOnlineV2::push(float *buf, int len, bool last)
	{
	bool reset = m_reset;
	if (m_reset)
	{
	m_reset = false;
	m_input_latency_output_frame = 0;
	m_in_queue->reset();
	m_out_queue->reset();
	m_rvc_inst->reset();
	}

	set_cur_state(reset);

	if (CRVC_V2_STATE_DEFAULT == m_sync_state)
	{
	std::unique_lock<std::mutex> lock(m_rvc_mutex);
	m_out_queue->push(buf, len);
	return ERR_RVC_LITE_SUCCESS;
	}

	// 此时无论怎样，都要让模型跑一下，得到结果再说
	m_in_queue->push(buf, len);
	while(m_in_queue->size() >= m_block_len \|\| last) {
	if (m_in_queue->size() <= 0)
	{
	return ERR_RVC_LITE_SUCCESS;
	}

	int cur_in_len = m_block_len;
	int cur_out_len = m_block_len;
	m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len);
	int err = m_rvc_inst->process(m_tmp_in_buf.get(), cur_in_len, m_tmp_out_buf.get(), cur_out_len);
	if (err != ERR_RVC_LITE_SUCCESS) {
	return err;
	}

	// 此时对于effect做fade_out,default做fade_in
	if (m_sync_state == CRVC_V2_STATE_EFFECT2DEFAULT)
	{
	// 此时由于m_rvc_inst本身存在延迟输出的情况[虽然头部的静音帧已经被砍掉了]，但是其输入的数据和输出的数据并不是完美对应的，存在延迟差
	// 所以此时输入的头部和输出的头部之前存在延迟差，但是不加音效是没有这个延迟差的
	// 所以需要将输入的头部对应到其应该对应的输出真实数据的头部
	// 比如: 输入: 1,2,3,4,5 输出: l1,l2,1,2,3 ,其中l1和l2是延迟采样点，也就是1,2，对应的是输出+延迟采样点才对
	for(int i = 0; i < m_fade_len; i+=m_channel)
	{
	float rate = i * 1.0 / m_fade_len;
	for(int j = 0; j < m_channel; j+=1)
	{
	m_tmp_in_buf.get()[i+j] = m_tmp_in_buf.get()[i+j] * rate + m_tmp_out_buf.get()[i+j+m_input_latency_output_frame] * (1 - rate);
	}
	}
	{
	std::unique_lock<std::mutex> lock(m_rvc_mutex);
	// 将之前要输入的那块塞进去
	m_out_queue->push(m_tmp_out_buf.get(), m_input_latency_output_frame);
	m_out_queue->push(m_tmp_in_buf.get(), cur_in_len);
	}

	m_sync_state = CRVC_V2_STATE_DEFAULT;
	m_input_latency_output_frame = 0;

	while(m_in_queue->size() > 0)
	{
	cur_in_len = m_block_len;
	m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len);
	{
	std::unique_lock<std::mutex> lock(m_rvc_mutex);
	m_out_queue->push(m_tmp_in_buf.get(), cur_in_len);
	}
	}
	return ERR_RVC_LITE_SUCCESS;
	}

	// 此时对effect做fade_in,default做fade_out
	if (m_sync_state == CRVC_V2_STATE_DEFAULT2EFFECT)
	{
	for(int i = 0; i < m_fade_len; i+=m_channel)
	{
	float rate = i * 1.0 / m_fade_len;
	for(int j = 0; j < m_channel; j+=1)
	{
	m_tmp_out_buf.get()[i+j] = m_tmp_out_buf.get()[i+j] * rate + m_tmp_in_buf.get()[i+j] * (1 - rate);
	}
	}
	// 设置状态
	m_sync_state = CRVC_V2_STATE_EFFECT;
	}

	// effect会存在输入和输出长度不一致的情况
	m_input_latency_output_frame += cur_in_len - cur_out_len;

	// 加锁塞入数据
	{
	std::unique_lock<std::mutex> lock(m_rvc_mutex);
	m_out_queue->push(m_tmp_out_buf.get(), cur_out_len);
	}
	}
	return ERR_RVC_LITE_SUCCESS;
	}

	int CRvcLiteOnlineV2::size()
	{
	return m_out_queue->size();
	}

	/**
	 * Pops samples from the output queue while holding m_rvc_mutex.
	 * @param buf destination buffer
	 * @param len passed by reference to the queue's pop()
	 */
	void CRvcLiteOnlineV2::pop(float *buf, int &len)
	{
	    std::lock_guard<std::mutex> guard(m_rvc_mutex);
	    m_out_queue->pop(buf, len);
	}
	diff --git a/mnn_demo/src/CRvcLiteSynthesizer.cpp b/mnn_demo/src/CRvcLiteSynthesizer.cpp
	index 6ff952b..711a6ed 100644
	--- a/mnn_demo/src/CRvcLiteSynthesizer.cpp
	+++ b/mnn_demo/src/CRvcLiteSynthesizer.cpp
	@@ -1,128 +1,127 @@
	//
	// Created by Administrator on 2024/1/21.
	//

	#include "CRvcLiteSynthesizer.h"
	#include <cstring>
	#include <sys/time.h>

	// Construction does no work; init() sets everything up.
	CRvcLiteSynthesizer::CRvcLiteSynthesizer() = default;

	// Nothing to release explicitly.
	CRvcLiteSynthesizer::~CRvcLiteSynthesizer() = default;

	/**
	 * Creates the inference instance and both resamplers and clears the
	 * scratch-buffer bookkeeping.
	 * @param hubert_model path forwarded to CRvcLiteOnline::init
	 * @param sample_rate  sample rate of the caller's audio
	 * @param channel      channel count of the caller's audio
	 * @return the error from CRvcLiteOnline::init, or ERR_RVC_LITE_SUCCESS
	 */
	int CRvcLiteSynthesizer::init(const char *hubert_model, int sample_rate, int channel)
	{
	    m_rvc_inst = std::make_shared<CRvcLiteOnline>();
	    const int status = m_rvc_inst->init(hubert_model);
	    if (ERR_RVC_LITE_SUCCESS != status)
	    {
	        return status;
	    }

	    // Caller format -> mono model input.
	    m_resample2_16 = std::make_shared<CResample>();
	    m_resample2_16->init(sample_rate, gs_src_samplerate, channel, 1);
	    // Mono model output -> caller format.
	    m_resample2src = std::make_shared<CResample>();
	    m_resample2src->init(gs_dst_samplerate, sample_rate, 1, channel);

	    m_sample_rate = sample_rate;
	    m_channel = channel;

	    // Scratch buffers are allocated lazily in process().
	    m_buf_tmp_16k_cap = 0;
	    m_buf_tmp_16k_len = 0;
	    m_buf_tmp_32k_cap = 0;
	    m_buf_tmp_32k_len = 0;
	    m_buf_tmp_src_cap = 0;
	    m_buf_tmp_src_len = 0;
	    m_first = true;
	    return ERR_RVC_LITE_SUCCESS;
	}

	/**
	 * Forwards the model path to the inference instance.
	 * @return whatever switch_synth_model() returns
	 */
	int CRvcLiteSynthesizer::switch_model(const char *synth_model)
	{
	    const int status = m_rvc_inst->switch_synth_model(synth_model);
	    return status;
	}

	void CRvcLiteSynthesizer::set_up_key(int key)
	{
	m_rvc_inst->set_up_key(key);
	}

	void CRvcLiteSynthesizer::reset()
	{
	m_rvc_inst->reset();
	m_first = true;
	}

	int CRvcLiteSynthesizer::process(float in_buf, int in_len, float out_buf, int &out_len) {
	// 1 重采样 2 推理 3 再次重采样
	int resample_out_len = m_resample2_16->get_out_samples(in_len / m_channel);
	// 控制逻辑，不能超过该长度
	if (resample_out_len > gs_src_samplerate) {
	return ERR_RVC_LITE_BLOCK_TOO_LONG;
	}
	-
	if (m_buf_tmp_16k_cap < resample_out_len) {
	m_buf_tmp_16k_cap = resample_out_len;
	m_buf_tmp_16k = std::shared_ptr<float>(new float[m_buf_tmp_16k_cap], std::default_delete<float[]>());
	}
	m_buf_tmp_16k_len = resample_out_len;
	int err = m_resample2_16->resample(in_buf, in_len / m_channel, m_buf_tmp_16k.get(), m_buf_tmp_16k_len);
	if (err != ERR_RVC_LITE_SUCCESS) {
	return err;
	}
	if (m_buf_tmp_32k_cap < m_buf_tmp_16k_len * 2) {
	m_buf_tmp_32k_cap = m_buf_tmp_16k_len * 2;
	m_buf_tmp_32k = std::shared_ptr<float>(new float[m_buf_tmp_32k_cap], std::default_delete<float[]>());
	}
	m_buf_tmp_32k_len = m_buf_tmp_16k_len * 2;

	// 推理
	err = m_rvc_inst->process_block(m_buf_tmp_16k.get(), m_buf_tmp_16k_len, m_buf_tmp_32k.get(), m_buf_tmp_32k_len);
	if (err != ERR_RVC_LITE_SUCCESS) {
	return err;
	}
	// 重采样回来
	int out_frame = m_resample2src->get_out_samples(m_buf_tmp_32k_len);
	if (m_buf_tmp_src_cap < out_frame * m_channel) {
	m_buf_tmp_src_cap = out_frame * m_channel;
	m_buf_tmp_src = std::shared_ptr<float>(new float[m_buf_tmp_src_cap], std::default_delete<float[]>());
	}
	m_buf_tmp_src_len = out_frame;
	err = m_resample2src->resample(m_buf_tmp_32k.get(), m_buf_tmp_32k_len, m_buf_tmp_src.get(), m_buf_tmp_src_len);
	if (err != ERR_RVC_LITE_SUCCESS) {
	return err;
	}

	// 取较小的值
	if (out_len > m_buf_tmp_src_len * m_channel)
	{
	out_len = m_buf_tmp_src_len * m_channel;
	}

	// 第一次过来，将头部的延迟块切掉
	int latency_frame = 0;
	if (m_first)
	{
	m_first = false;
	latency_frame = int(m_rvc_inst->get_latency_ms() * 1.0 / 1000 * m_sample_rate) * m_channel;
	out_len -= latency_frame;
	}
	memcpy(out_buf, m_buf_tmp_src.get()+latency_frame, sizeof(float) * out_len);
	return ERR_RVC_LITE_SUCCESS;
	}


	/**
	 * Measures the real-time factor: wall-clock time spent in process()
	 * divided by the duration of the audio fed to it.
	 *
	 * A block of silence slightly shorter than one second is used.
	 * Note that this runs a real process() call, so it advances the
	 * synthesizer's streaming state.
	 *
	 * @return the real-time factor, or 0 when no valid block length can be
	 *         derived from the configured sample rate and channel count
	 */
	float CRvcLiteSynthesizer::get_rtf()
	{
	    int in_len = m_sample_rate * m_channel - 100 * m_channel;
	    if (in_len <= 0)
	    {
	        return 0.0f;
	    }
	    int out_len = in_len;
	    // Value-initialised (zeroed) buffers: the model must not be timed on
	    // uninitialised memory, and input and output are kept separate.
	    std::shared_ptr<float> in_buf(new float[in_len](), std::default_delete<float[]>());
	    std::shared_ptr<float> out_buf(new float[out_len](), std::default_delete<float[]>());

	    struct timeval start;
	    struct timeval end;
	    gettimeofday(&start, NULL);
	    process(in_buf.get(), in_len, out_buf.get(), out_len);
	    gettimeofday(&end, NULL);

	    double sp = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0;
	    // Duration of the test block in milliseconds.
	    double audio_ms = in_len * 1000.0 / (m_sample_rate * m_channel);
	    return sp / audio_ms;
	}
	\ No newline at end of file
	diff --git a/mnn_demo/src/CSynthesizer.cpp b/mnn_demo/src/CSynthesizer.cpp
	index 2346fe9..427d6c8 100644
	--- a/mnn_demo/src/CSynthesizer.cpp
	+++ b/mnn_demo/src/CSynthesizer.cpp
	@@ -1,73 +1,86 @@
	//
	// Created by ZIHAO GUO on 2023/11/16.
	//

	#include "CSynthesizer.h"
	#include <cstring>
	#include <sys/time.h>

	// Construction does no work; init() loads the model.
	CSynthesizer::CSynthesizer() {}

	// Releases the model through uninit().
	CSynthesizer::~CSynthesizer()
	{
	    this->uninit();
	}

	/**
	 * Loads the model on the CPU backend and creates the session.
	 * @param model_path path of the model file
	 * @return 0 on success, -1 when the path is null or the interpreter,
	 *         the session or the input tensor could not be created
	 */
	int CSynthesizer::init(const char *model_path) {
	    if (model_path == nullptr)
	    {
	        return -1;
	    }
	    m_config.type = MNN_FORWARD_CPU;
	    m_runtime_info = MNN::Interpreter::createRuntime({m_config});
	    m_net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path));
	    // createFromFile yields null for a missing or invalid model file.
	    if (m_net == nullptr)
	    {
	        return -1;
	    }
	    m_session = m_net->createSession(m_config, m_runtime_info);
	    if (m_session == nullptr)
	    {
	        return -1;
	    }
	    m_input_tensor = m_net->getSessionInput(m_session, nullptr);
	    if (m_input_tensor == nullptr)
	    {
	        return -1;
	    }
	    return 0;
	}

	/**
	 * Runs the model on one input and copies the output tensor into ret.
	 *
	 * @param contentvec_input input features; row i of contentvec_input[0] is
	 *        copied into row i of a {1, 205, 258} tensor
	 *        (assumes at least 205 rows of at least 258 floats - TODO confirm against callers)
	 * @param ret receives the output, indexed [dim0][dim1][dim2]; it is grown
	 *        where it is smaller than the output shape and never shrunk
	 * @return time spent in runSession, in milliseconds
	 */
	float CSynthesizer::process(std::vector<std::vector<std::vector<float>>> &contentvec_input, std::vector<std::vector<std::vector<float>>> &ret) {
	    std::vector<int> input_dims{1, 205, 258};
	    auto input_tensor = MNN::Tensor::create<float>(input_dims, nullptr, MNN::Tensor::CAFFE);
	    auto input_data = input_tensor->host<float>();
	    auto input_size = input_tensor->size();
	    // Copy row by row: the nested vectors are not contiguous in memory.
	    for (int i = 0; i < 205; i++)
	    {
	        std::memcpy(input_data + i * 258, contentvec_input[0][i].data(), input_size / 205);
	    }
	    m_input_tensor->copyFromHostTensor(input_tensor);
	    delete input_tensor;

	    struct timeval start;
	    struct timeval end;
	    gettimeofday(&start, NULL);
	    m_net->runSession(m_session);
	    gettimeofday(&end, NULL);
	    float time = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0;

	    auto output_tensor = m_net->getSessionOutput(m_session, nullptr);
	    std::vector<int> shape = output_tensor->shape();
	    auto output = MNN::Tensor::create<float>(shape, nullptr, MNN::Tensor::CAFFE);
	    auto output_data = output->host<float>();
	    output_tensor->copyToHostTensor(output);

	    // The copy below needs a rank-3 output.
	    if (shape.size() >= 3)
	    {
	        if (static_cast<size_t>(shape[0]) > ret.size())
	        {
	            ret.resize(shape[0]);
	        }
	        for (int i = 0; i < shape[0]; i++)
	        {
	            // Grow the i-th plane (the earlier code indexed ret with j here).
	            if (static_cast<size_t>(shape[1]) > ret[i].size())
	            {
	                ret[i].resize(shape[1]);
	            }
	            for (int j = 0; j < shape[1]; j++)
	            {
	                if (static_cast<size_t>(shape[2]) > ret[i][j].size())
	                {
	                    ret[i][j].resize(shape[2]);
	                }
	                // Row-major offset of element [i][j][0]; for a {1, 1, 35840}
	                // output this equals the fixed i * 35840 stride used before.
	                const float *row = output_data + (static_cast<size_t>(i) * shape[1] + j) * shape[2];
	                for (int k = 0; k < shape[2]; k++)
	                {
	                    ret[i][j][k] = row[k];
	                }
	            }
	        }
	    }
	    // The host copy is owned by this function, like input_tensor above.
	    delete output;
	    return time;
	}

	/**
	 * Releases the model (when one is loaded) and clears the interpreter,
	 * session and input-tensor handles.
	 */
	void CSynthesizer::uninit() {
	    if (m_net)
	    {
	        m_net->releaseModel();
	    }
	    m_net.reset();
	    m_input_tensor = nullptr;
	    m_session = nullptr;
	}


	diff --git a/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp b/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp
	index a0c762d..604ab5a 100644
	--- a/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp
	+++ b/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp
	@@ -1,163 +1,162 @@
	/* -- c-basic-offset: 4 indent-tabs-mode: nil -- vi:set ts=8 sts=4 sw=4: */

	/*
	pYIN - A fundamental frequency estimator for monophonic audio
	Centre for Digital Music, Queen Mary, University of London.

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License as
	published by the Free Software Foundation; either version 2 of the
	License, or (at your option) any later version. See the file
	COPYING included with this distribution for more information.
	*/

	#include "ESPYIN.h"
	#include "ESMonoPitch.h"

	#include <vector>
	#include <algorithm>

	#include <cstdio>
	#include <cmath>
	#include <complex>

	using std::string;
	using std::vector;

	// Stores the analysis parameters, constructs the YIN core with the block
	// size and sample rate, assigns the output slot indices 0..4, and calls
	// reset() to apply the threshold distribution and frame size.
	ESPYIN::ESPYIN(float inputSampleRate, size_t stepSize, size_t blockSize, size_t fmin, size_t fmax) :
	m_stepSize(stepSize),
	m_blockSize(blockSize),
	m_fmin(fmin),
	m_fmax(fmax),
	m_yin(blockSize, inputSampleRate, 0.0),
	// Keys used for the feature-set entries filled by process() and getRemainingFeatures().
	m_oF0Candidates(0),
	m_oF0Probs(1),
	m_oVoicedProb(2),
	m_oCandidateSalience(3),
	m_oSmoothedPitchTrack(4),
	m_threshDistr(2.0f),
	m_outputUnvoiced(2.0f),
	m_pitchProb(0)
	{
	reset();
	}

	// Nothing to release explicitly.
	ESPYIN::~ESPYIN() {}

	// Drops the accumulated pitch candidates and re-applies the threshold
	// distribution and frame size to the YIN core.
	void
	ESPYIN::reset()
	{
	    m_pitchProb.clear();

	    m_yin.setThresholdDistr(m_threshDistr);
	    m_yin.setFrameSize(m_blockSize);
	}

	// Keeps only the most recent reserve_frame_num frames of pitch candidates.
	// Does nothing when reserve_frame_num is not positive or exceeds the
	// number of stored frames.
	void
	ESPYIN::updata(int reserve_frame_num)
	{
	    const int frame_num = int(m_pitchProb.size());
	    if (reserve_frame_num <= 0 || reserve_frame_num > frame_num) {
	        return;
	    }

	    // Erase the older frames in place rather than copying the whole history twice.
	    m_pitchProb.erase(m_pitchProb.begin(), m_pitchProb.end() - reserve_frame_num);
	}

	-ESFeatureSet
	-ESPYIN::process(const float * const inputBuffers)
	+ESFeatureSet ESPYIN::process(const float * const inputBuffers)
	{
	ESFeatureSet fs;
	double *dInputBuffers = new double[m_blockSize];
	for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[i];
	ESYin::YinOutput yo = m_yin.processProbabilisticYin(dInputBuffers);
	ESFeature f;
	for (size_t i = 0; i < yo.freqProb.size(); ++i)
	{
	f.values.push_back(yo.freqProb[i].first);
	}
	fs[m_oF0Candidates].push_back(f);
	f.values.clear();
	float voicedProb = 0;
	for (size_t i = 0; i < yo.freqProb.size(); ++i)
	{
	f.values.push_back(yo.freqProb[i].second);
	voicedProb += yo.freqProb[i].second;
	}
	fs[m_oF0Probs].push_back(f);

	f.values.clear();
	f.values.push_back(voicedProb);
	fs[m_oVoicedProb].push_back(f);

	f.values.clear();
	float salienceSum = 0;
	for (size_t iBin = 0; iBin < yo.salience.size(); ++iBin)
	{
	f.values.push_back(yo.salience[iBin]);
	salienceSum += yo.salience[iBin];
	}
	fs[m_oCandidateSalience].push_back(f);
	delete [] dInputBuffers;

	vector<pair<double, double> > tempPitchProb;
	for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate)
	{
	double tempPitch = 12 * std::log(yo.freqProb[iCandidate].first/440)/std::log(2.) + 69;
	tempPitchProb.push_back(pair<double, double>
	(tempPitch, yo.freqProb[iCandidate].second));
	}
	m_pitchProb.push_back(tempPitchProb);

	return fs;
	}

	// Runs the mono-pitch (Viterbi) smoother over all stored frames and
	// returns the smoothed pitch track. Returns an empty set when no frames
	// have been processed.
	// m_outputUnvoiced selects how negative outputs are handled:
	// 0 skips them, 1 emits their absolute value, anything else emits them as is.
	ESFeatureSet
	ESPYIN::getRemainingFeatures(int reso_type)
	{
	    ESFeatureSet fs;
	    ESFeature f;
	    vector<vector<pair<double, double> > > temp_pitchProb(m_pitchProb);

	    if (temp_pitchProb.empty()) {
	        return fs;
	    }

	    // MONO-PITCH STUFF
	    ESMonoPitch mp(reso_type);
	    vector<float> mpOut = mp.process(temp_pitchProb);
	    for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame)
	    {
	        if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue;
	        f.values.clear();
	        if (m_outputUnvoiced == 1)
	        {
	            // std::fabs keeps the value in floating point; unqualified abs
	            // can bind to the integer overload and truncate.
	            f.values.push_back(std::fabs(mpOut[iFrame]));
	        } else {
	            f.values.push_back(mpOut[iFrame]);
	        }

	        fs[m_oSmoothedPitchTrack].push_back(f);
	    }

	    return fs;
	}

	// Number of frames whose pitch candidates are currently stored.
	int
	ESPYIN::getFrames() {
	    const size_t stored = m_pitchProb.size();
	    return static_cast<int>(stored);
	}

File Metadata

Mime Type: text/x-diff
Expires: Sun, Jan 12, 08:33 (1 d, 15 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1347192
Default Alt Text: (63 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions