Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F4880319
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
67 KB
Subscribers
None
View Options
diff --git a/mnn_demo/inc/CRvcLiteOnline.h b/mnn_demo/inc/CRvcLiteOnline.h
index ad62799..b4100aa 100644
--- a/mnn_demo/inc/CRvcLiteOnline.h
+++ b/mnn_demo/inc/CRvcLiteOnline.h
@@ -1,309 +1,318 @@
//
// Created by jianli.yang on 2023/11/29.
//
#ifndef MNN_DEMO_CRVCLITEONLINE_H
#define MNN_DEMO_CRVCLITEONLINE_H
// NOTE(review): DEBUG is force-defined here, so the printf logging branch
// below is always active on non-Android builds — confirm this is intended.
#define DEBUG
#ifdef __ANDROID__
#include <android/log.h>
#ifdef STRELEASE
// Release builds: logging compiles away entirely.
#define LOGD(...)
#define LOGE(...)
#else
#define LOGD(TAG, ...) __android_log_print(ANDROID_LOG_DEBUG , TAG, __VA_ARGS__)
#define LOGE(TAG, ...) __android_log_print(ANDROID_LOG_ERROR , TAG, __VA_ARGS__)
#endif
#else
#ifdef DEBUG
// Wrapped in do { } while (0) so each macro expands to exactly one statement.
// The previous two-statement expansion broke `if (cond) LOGD(...); else ...`
// (the second printf escaped the if, and the trailing `;` orphaned the else).
#define LOGD(TAG, ...) do { printf("\nDebug: %s", TAG); printf(__VA_ARGS__); } while (0)
#define LOGE(TAG, ...) do { printf("\nError: %s", TAG); printf(__VA_ARGS__); } while (0)
#else
#define LOGD(TAG, ...)
#define LOGE(TAG, ...)
#endif
#endif
#include <mutex>
#include <string>
#include <memory>
#include <vector>
#include <condition_variable>
#define gs_src_samplerate 16000
#define gs_dst_samplerate 32000
#define gs_crossfade_time 0.08 // 单位是s
#define gs_block_time 1
#define gs_extra_time 1
#define gs_hubert_frame 206 // 和模型相关
#define gs_hubert_dim 256 // 和模型相关
#define gs_synth_input_frame 205 // 和模型相关
#define gs_synth_input_dim 258 // 和模型相关
#define gs_synth_output_frame 35840 // 和模型相关
enum {
ERR_RVC_LITE_SUCCESS = 0,
ERR_RVC_LITE_NOT_INIT = 1,
ERR_RVC_LITE_REINIT = 2,
ERR_RVC_LITE_RT_REINIT = 3,
ERR_RVC_LITE_RT_NOT_INIT = 4,
ERR_RVC_LITE_RT_NOT_ENOUGH_DATA = 5,
ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR = 6, // 采样率小于16000
ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT = 7, // 重采样后的buf太短
- ERR_RVC_LITE_NOT_SWITCH_MODEL = 8, // 重采样后的buf太短
+ ERR_RVC_LITE_NOT_SWITCH_MODEL = 8, // 没有选择音色模型
ERR_RVC_LITE_MODEL_NOT_EXISTS = 9, // 没有人声模型
ERR_RVC_LITE_BLOCK_TOO_LONG = 10, // 区块过大
};
// Synthesis states stored in CRvcLiteOnlineRealTime::m_syn_state.
// Names suggest a small state machine: pass-through (DEFAULT), effect active
// (EFFECT), the two cross-fade transitions between them, and a warm-up state
// (BEFORE_DEFAULT) — presumably consumed by rvc_process_step(); confirm there.
const int RVC_LITE_RT_SYN_STATE_DEFAULT = 0;
const int RVC_LITE_RT_SYN_STATE_EFFECT = 1;
const int RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT = 2;
const int RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT = 3;
const int RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT = 4;
class Hubert;
class CSynthesizer;
class ESPYIN;
class CThreadPool;
class CRvcCircleBuffer;
class CFfmpegResampler;
/**
* Rvc轻量化实时推理代码
* 要求输入16k的音频数据,输出是目标采样率的数据
*/
class CRvcLiteOnline {
public:
CRvcLiteOnline();
~CRvcLiteOnline();
private:
void uninit();
void get_f0_post();
void get_pyin_f0();
void init_variable();
public:
/**
* 初始化函数
* @param hubert_model_path
* @return 0 表示正常
*/
int init(const char *hubert_model_path);
/**
* 换音色模型
* @param synth_model_path
* @return
*/
int switch_synth_model(const char* synth_model_path);
/**
+ * 设置key的变化[-12,12]
+ * @param key
+ */
+ void set_up_key(int key);
+
+ /**
* 处理定长的一帧数据
* 要求输入单声道16k音频
* @param in_buf
* @param in_len 长度小于等于gs_src_samplerate,最佳是gs_src_samplerate
* @param out_buf
* @param out_len 小于等于gs_dst_samplerate,最佳是gs_dst_samplerate[和输入有关,如果是32k,则恰好是输入的两倍]
* @return 0 表示正常
*/
int process_block(float *in_buf, int in_len, float *out_buf, int out_len);
/**
* 清空存储
* @return
*/
void reset();
/**
* 获取延迟时间
* @return
*/
int get_latency_ms();
private:
// 是否进行过init
bool m_init;
bool m_switch_model;
std::shared_ptr<Hubert> m_hubert_inst;
std::shared_ptr<CSynthesizer> m_synthesizer_inst;
std::shared_ptr<ESPYIN> m_es_pyin;
// 缓存使用的数据
// 要求输入的时间片长度,采样点数
int m_input_block_frame;
// 推理时额外需要的长度
int m_input_extra_frame;
// 推理时使用的buffer长度
int m_input_predict_buf_frame;
// 推理时使用的buffer
float *m_input_predict_buf;
std::vector<float> m_f0_data;
std::vector<float> m_f0_coarse_data;
// 输出的情况
int m_crossfade_frame;
int m_output_block_frame;
int m_output_cache_buf_frame;
float *m_crossfade_buf;
float *m_output_cache_buf;
bool m_fade_in;
+
+ float m_f0_new_up_key;
+ float m_f0_up_key;
// 各个实例的返回结果
std::vector<std::vector<std::vector<float>>> m_hubert_ret;
std::vector<std::vector<std::vector<float>>> m_synth_input;
std::vector<std::vector<std::vector<float>>> m_synth_out;
};
/**
 * Thin wrapper around CFfmpegResampler that additionally handles the pure
 * channel-conversion case (equal sample rates, different channel counts)
 * itself, without instantiating a resampler (see init()).
 */
class CResample {
public:
    CResample();
    ~CResample();
public:
    int init(int in_samplerate, int out_samplerate, int in_channel=1, int out_channel=1);
    // Returns the per-channel output sample count produced for `num` input samples.
    int get_out_samples(int num);
    int get_latency();
    void reset();
    // No internal caching: converts whatever is available in a single call.
    // in_num and out_num are both per-channel sample counts.
    int resample(float * in_buf, int in_num, float * out_buf, int & out_num);
private:
    std::shared_ptr<CFfmpegResampler> m_resample_inst;  // null in channel-only mode
    int m_in_channel;
    int m_out_channel;
};
/**
 * Real-time processing class.
 * One frame in, one frame out; very short input frames are allowed, at the
 * cost of relatively high latency (around 2 s).
 * Flow:
 * 1. The constructor sets up member variables.
 * 2. init() prepares the environment and starts the processing thread.
 * 3. process() is fed one frame at a time and triggers the scheduling logic.
 * 4. flush() processes all remaining queued input and returns it together
 *    with any output that has not yet been consumed.
 * 5. The destructor stops the processing thread and releases all resources.
 */
class CRvcLiteOnlineRealTime {
public:
    CRvcLiteOnlineRealTime();
    ~CRvcLiteOnlineRealTime();
private:
    void init_variable();
    void rvc_process();
    void rvc_process_step();
    void uninit();
    void stop();
public:
    /**
     * Initialize.
     * @param hubert_model_path
     * @param sample_rate
     * @param channel
     * @return 0 on success
     */
    int init(const char *hubert_model_path, int sample_rate, int channel);
    /**
     * Switch the voice (timbre) model.
     * @param synth_model_path
     * @return
     */
    int switch_synth(const char *synth_model_path);
    /**
     * Clear all cached data.
     */
    void reset();
    /**
     * One frame in, one frame out; the two lengths must match.
     * in_buf and out_buf may alias the same buffer.
     * @param in_buf
     * @param in_len
     * @param out_buf
     * @param out_len
     * @return
     */
    int process(float *in_buf, int in_len, float *out_buf, int out_len);
    /**
     * Fetch all remaining processed results.
     * The amount is unknown in advance, so the buffer is allocated
     * internally and must be released by the caller.
     * @return
     */
    void flush(float *&out_buf, int &len);
    /**
     * Latency in milliseconds.
     */
    int get_latency_ms();
private:
    int m_sample_rate;
    int m_channel;
    // Queues: raw caller-rate input (filled by process()), 16 k input for
    // inference, and finished output at the caller's rate/channel layout.
    std::shared_ptr<CRvcCircleBuffer> m_resample_queue;
    std::shared_ptr<CRvcCircleBuffer> m_input_queue;
    std::shared_ptr<CRvcCircleBuffer> m_out_queue;
    int m_input_tmp_buf_len;
    int m_output_tmp_buf_len;
    std::shared_ptr<float> m_input_tmp_buf;
    std::shared_ptr<float> m_output_tmp_buf;
    std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
    std::shared_ptr<CThreadPool> m_thread_pool;
    // Lifecycle flag.
    bool m_init;
    // Processing-thread control.
    bool m_rvc_stop;
    std::mutex m_rvc_mutex;
    std::condition_variable m_rvc_cond;
    // Resampling: down to 16 k for inference, back to the source rate for output.
    std::shared_ptr<CResample> m_resample16;
    std::shared_ptr<CResample> m_resample2src;
    int m_resample_buf_max_len;
    std::shared_ptr<float> m_resample_in_buf;
    std::shared_ptr<float> m_resample_out_buf;
    // Timbre switching: current model path vs. newly requested path.
    std::string m_synth_path;
    std::string m_new_synth_path;
    // Synthesis state (RVC_LITE_RT_SYN_STATE_*).
    int m_syn_state;
    // Delay line used to keep latency constant when no effect is applied.
    std::shared_ptr<CRvcCircleBuffer> m_latency_queue;
};
#endif //MNN_DEMO_CRVCLITEONLINE_H
diff --git a/mnn_demo/inc/CRvcLiteOnlineV2.h b/mnn_demo/inc/CRvcLiteOnlineV2.h
new file mode 100644
index 0000000..5abeb15
--- /dev/null
+++ b/mnn_demo/inc/CRvcLiteOnlineV2.h
@@ -0,0 +1,105 @@
+//
+// Created by Administrator on 2024/1/22.
+//
+
+#ifndef MNN_DEMO_CRVCLITEONLINEV2_H
+#define MNN_DEMO_CRVCLITEONLINEV2_H
+#include "CRvcLiteSynthesizer.h"
+
+const int CRVC_V2_STATE_DEFAULT = 0;
+const int CRVC_V2_STATE_EFFECT = 1;
+const int CRVC_V2_STATE_DEFAULT2EFFECT = 2;
+const int CRVC_V2_STATE_EFFECT2DEFAULT = 3;
+
+/**
+ * 使用方式:
+ * 初始化之后: push和pop以及switch_model均异步处理即可
+ * 具体使用方式可以参考:main.cpp用法
+ */
+class CRvcLiteOnlineV2
+{
+public:
+ CRvcLiteOnlineV2();
+ ~CRvcLiteOnlineV2();
+
+private:
+ void set_cur_state(bool reset);
+
+public:
+ /**
+ * 初始化,给定HubertModel,采样率和通道数
+ * @param hubert_model
+ * @param sample_rate
+ * @param channel
+ * @return 0 表示正常
+ */
+ int init(const char* hubert_model, int sample_rate, int channel);
+
+ /**
+ * 设置人声模型地址,如果人声模型不存在,则会返回错误码
+ * @param synth_model
+ * @return 0 表示正常
+ */
+ int switch_model(const char* synth_model);
+
+ /**
+ * 设置变调,范围是[-12, 12]
+ * 有人声模型才生效,否则不生效
+ * 换人声模型,该状态不会丢失,并且在无人声的时候设置之后,有人声模型后也会生效
+ * @param key
+ */
+ void set_up_key(int key);
+
+ /**
+ * 清空缓存数据
+ */
+ void reset();
+
+ /**
+ * 输入人声数据,阻塞,
+ * @param buf 人声数据地址[为了省空间,会对这个buf做修改,reset之后的第一帧会进行fade_in操作]
+ * @param len 长度为len代表sample*channel
+ * @param last true代表最后一帧,不论如何都会进行推理,将结果获取出来
+ * @return 0 代表正常
+ */
+ int push(float* buf, int len, bool last=false);
+
+ /**
+ * 返回内部当前可用的数据总数
+ * frame*channel
+ * @return
+ */
+ int size();
+
+ /**
+ * 获取处理之后的结果
+ * @param buf buf地址
+ * @param len 当前buf的长度,返回时如果内部数据不足len的长度则会修改len,表明返回的长度,如果超过,则最多返回len
+ * len 长度代表buffer长度也就是frame*channel
+ */
+ void pop(float* buf, int& len);
+
+public:
+ // 处理逻辑
+ std::shared_ptr<CRvcLiteSynthesizer> m_rvc_inst;
+ // 输入的队列
+ std::shared_ptr<CRvcCircleBuffer> m_in_queue;
+ // 输出的队列
+ std::shared_ptr<CRvcCircleBuffer> m_out_queue;
+ std::shared_ptr<float> m_tmp_in_buf;
+ std::shared_ptr<float> m_tmp_out_buf;
+ int m_tmp_buf_len;
+ int m_block_len;
+ bool m_reset;
+ std::string m_syn_model;
+ std::string m_new_syn_model;
+ int m_sync_state;
+ int m_fade_len;
+ int m_channel;
+ // 输入和输出的差距
+ int m_input_latency_output_frame;
+ std::mutex m_rvc_mutex;
+};
+
+
+#endif //MNN_DEMO_CRVCLITEONLINEV2_H
diff --git a/mnn_demo/inc/CRvcLiteSynthesizer.h b/mnn_demo/inc/CRvcLiteSynthesizer.h
index bac8c21..cedcb2e 100644
--- a/mnn_demo/inc/CRvcLiteSynthesizer.h
+++ b/mnn_demo/inc/CRvcLiteSynthesizer.h
@@ -1,58 +1,82 @@
//
// Created by Administrator on 2024/1/21.
//
#ifndef MNN_DEMO_CRVCLITESYNTHESIZER_H
#define MNN_DEMO_CRVCLITESYNTHESIZER_H
#include "CRvcLiteOnline.h"
class CRvcLiteSynthesizer
{
public:
CRvcLiteSynthesizer();
~CRvcLiteSynthesizer();
public:
/**
* 初始化
* @param hubert_model 语义模型地址
- * @param synth_model 音色模型地址
* @param sample_rate 采样率
* @param channel 通道数
* @return 0 表示正常
*/
- int init(const char* hubert_model, const char* synth_model, int sample_rate, int channel);
+ int init(const char* hubert_model, int sample_rate, int channel);
+
+ /**
+ * 选择人声模型
+ * @param synth_model 音色模型地址
+ * @param enable 是否开启
+ * @return
+ */
+ int switch_model(const char* synth_model);
+
+ /**
+ * 设置变调,范围是[-12, 12]
+ * 有人声模型才生效,否则不生效
+ * 换人声模型,该状态不会丢失,并且在无人声的时候设置之后,有人声模型后也会生效
+ * @param key
+ */
+ void set_up_key(int key);
+
+ /**
+ * reset,清空内部数据
+ */
+ void reset();
/**
* 处理逻辑
* @param in_buf 输入的buf
* @param in_len 输入的Buf长度,frame*channel,建议输入小于等于1s的音频长度,尽量的大就好
* @param out_buf 输出的buf
* @param out_len 输出的buf长度, frame*channel
* 注意: 此处有可能出现输出的长度不一定等于in_len,输出的值会小于等于out_len,但是是连续的,所以out_len可以适当比in_len大一些,从而保证都能搞出来
* @return
*/
int process(float* in_buf, int in_len, float* out_buf, int &out_len);
- // 获取实时率,处理1s数据的真实耗时/1s
+ /**
+ * 获取实时率,处理1s数据的真实耗时/1s
+ * @return
+ */
float get_rtf();
private:
std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
std::shared_ptr<CResample> m_resample2_16;
std::shared_ptr<CResample> m_resample2src;
int m_channel;
int m_sample_rate;
std::shared_ptr<float> m_buf_tmp_16k;
int m_buf_tmp_16k_len;
int m_buf_tmp_16k_cap;
std::shared_ptr<float> m_buf_tmp_32k;
int m_buf_tmp_32k_len;
int m_buf_tmp_32k_cap;
std::shared_ptr<float> m_buf_tmp_src;
int m_buf_tmp_src_len;
int m_buf_tmp_src_cap;
+ bool m_first;
};
#endif //MNN_DEMO_CRVCLITESYNTHESIZER_H
diff --git a/mnn_demo/main.cpp b/mnn_demo/main.cpp
index 8aa637d..0d6b685 100644
--- a/mnn_demo/main.cpp
+++ b/mnn_demo/main.cpp
@@ -1,221 +1,285 @@
#include <sys/time.h>
#include <thread>
#include <chrono>
#include "src/Hubert.h"
#include "src/CSynthesizer.h"
#include "CRvcLiteSynthesizer.h"
+#include "CRvcLiteOnlineV2.h"
// Smoke test for the Hubert model in isolation: one forward pass over a
// constant dummy input, timing result discarded.
int test_hubert() {
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v1_fp16.mnn";
    Hubert hubert;
    int err_code = hubert.init(hubert_model_path);
    // Dummy input of 33280 samples, all 0.1f.
    std::vector<float> input(33280, 0.1);
    // Output container shaped 1 x 205 x 256 to match the model.
    std::vector<std::vector<std::vector<float>>> ret;
    ret.resize(1);
    ret[0].resize(205);
    for (int i = 0; i < 205; i++) {
        ret[0][i].resize(256);
    }
    // process() returns an elapsed-time value (units defined by Hubert — confirm).
    float time = hubert.process(input.data(), ret);
    return 0;
}
// Benchmark of the synthesizer ("contentvec") model: builds one fixed dummy
// frame (256 feature dims + f0-coarse at index 256 + f0 at index 257), runs
// it repeatedly, and prints the mean per-run time.
int test_contentvec() {
    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
    CSynthesizer contentVec;
    int err_code = contentVec.init(contentvec_model_path);
    // Input shaped 1 x 205 x 258.
    std::vector<std::vector<std::vector<float>>> input(1);
    input[0].resize(205);
    for (int i = 0; i < 205; i++) {
        for (int j = 0; j < 258; j++) {
            if (j == 256) {
                input[0][i].push_back(0.2);
            } else if (j == 257) {
                input[0][i].push_back(1.0);
            } else {
                input[0][i].push_back(0.1);
            }
        }
    }
    // Output shaped 1 x 1 x 35840.
    std::vector<std::vector<std::vector<float>>> ret;
    ret.resize(1);
    for (int i = 0; i < 1; i++) {
        ret[i].resize(1);
        ret[i][0].resize(35840);
    }
    // Bug fix: the loop accumulates 10 runs, but the mean previously divided
    // by 100, under-reporting the per-run time by 10x.
    const int kRuns = 10;
    float tot = 0.f;
    for (int i = 0; i < kRuns; i++) {
        float time = contentVec.process(input, ret);
        tot += time;
    }
    printf("time: %f \n", tot / kRuns);
    return 0;
}
#include "CRvcLiteOnline.h"
#include "av_waves/waves/inc/STWaveFile.h"
// Offline end-to-end test of CRvcLiteOnline: reads a 16 k mono wav, converts
// it block by block (1 s per block), and writes a 32 k mono wav.
void test() {
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
    const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_16.wav";
//    const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
    const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v1.wav";
    CRvcLiteOnline rvc_inst;
    rvc_inst.init(hubert_model_path);
    // Fix: process_block() refuses to run (ERR_RVC_LITE_NOT_SWITCH_MODEL)
    // until a timbre model is selected; the synth model path was previously
    // declared but never used, leaving the whole test producing silence.
    rvc_inst.switch_synth_model(contentvec_model_path);
    // Read the audio file; must be 16 k, mono.
    STCWaveFile wav_inst(in_wav, false);
    int sample_rate = wav_inst.GetSampleRate();
    int channel = wav_inst.GetChannels();
    int len = wav_inst.GetTotalFrames() * channel;
    float *data = new float[len];
    float *outdata = new float[len * 2];  // output rate is 2x the input rate
    wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
    int step = sample_rate;
    printf("start ..\n");
    for (int i = 0; i < len; i += step) {
        if (i + step > len) {
            step = len - i;
        }
        struct timeval start;
        struct timeval end;
        gettimeofday(&start, NULL);
        rvc_inst.process_block(data + i, step, outdata + 2 * i, 2 * step);
        gettimeofday(&end, NULL);
        printf("sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
    }
    STCWaveFile wav_out_inst(out_wav, true);
    wav_out_inst.SetSampleRate(32000);
    wav_out_inst.SetChannels(1);
    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
    wav_out_inst.SetupDone();
    wav_out_inst.WriteFrame(outdata, len * 2);
    printf("finish2 ....\n");
    // Fix: the input/output buffers were leaked.
    delete[] data;
    delete[] outdata;
}
void test_rvc_lite_synth()
{
const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
const char *out_wav = "/mnt/d/dataset/tmp/i_out3.wav";
const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
STCWaveFile wav_inst(in_wav, false);
int sample_rate = wav_inst.GetSampleRate();
int channel = wav_inst.GetChannels();
int len = wav_inst.GetTotalFrames() * channel;
float *data = new float[len];
float *outdata = new float[len];
wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
CRvcLiteSynthesizer m_rvc_inst;
- int err = m_rvc_inst.init(hubert_model_path, syz_model, sample_rate, channel);
+ int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel);
printf("init err=%d!\n", err);
printf("rtf=%f\n", m_rvc_inst.get_rtf());
int step = sample_rate * channel - 100 * channel;
int out_len = 0;
for(int i = 0; i < len; i+=step)
{
if (i + step > len) {
step = len - i;
}
int out_step = step;
err = m_rvc_inst.process(data+i, step, outdata+out_len, out_step);
if(err != ERR_RVC_LITE_SUCCESS)
{
- printf("process err!\n");
+ printf("process err=%d!\n", err);
return ;
}
out_len += out_step;
}
STCWaveFile wav_out_inst(out_wav, true);
wav_out_inst.SetSampleRate(sample_rate);
wav_out_inst.SetChannels(channel);
wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
wav_out_inst.SetupDone();
wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
delete[] data;
delete[] outdata;
}
+void test_rvc_lite_v2()
+{
+ const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
+ const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
+ const char *out_wav = "/mnt/d/dataset/tmp/i_out_01_r.wav";
+ const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
+
+ STCWaveFile wav_inst(in_wav, false);
+ int sample_rate = wav_inst.GetSampleRate();
+ int channel = wav_inst.GetChannels();
+ int len = wav_inst.GetTotalFrames() * channel;
+ float *data = new float[len];
+ float *outdata = new float[len];
+ wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
+ CRvcLiteOnlineV2 m_rvc_inst;
+ int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel);
+// m_rvc_inst.switch_model(syz_model);
+// m_rvc_inst.set_up_key(0);
+ printf("init err=%d!\n", err);
+ int step = sample_rate * channel - 100 * channel;
+ int out_len = 0;
+ bool last = false;
+ int flag = 0;
+ for(int i = 0; i < len; i+=step)
+ {
+ if (i + step > len) {
+ step = len - i;
+ last = true;
+ }
+ int out_step = step;
+ err = m_rvc_inst.push(data+i, step, last);
+ if(err != ERR_RVC_LITE_SUCCESS)
+ {
+ printf("process err=%d!\n", err);
+ return ;
+ }
+
+ if (i >= len / 3 && flag == 0)
+ {
+ flag = 1;
+ m_rvc_inst.switch_model(syz_model);
+ }
+
+ if (i >= len / 2 && flag == 1)
+ {
+ flag = 2;
+ m_rvc_inst.reset();
+ }
+
+ out_step = 2 * step;
+ m_rvc_inst.pop(outdata+out_len, out_step);
+ out_len += out_step;
+ }
+ STCWaveFile wav_out_inst(out_wav, true);
+ wav_out_inst.SetSampleRate(sample_rate);
+ wav_out_inst.SetChannels(channel);
+ wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
+ wav_out_inst.SetupDone();
+ wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
+ delete[] data;
+ delete[] outdata;
+}
// End-to-end test of the real-time pipeline: feeds a wav in 1024-sample
// chunks (with a 15 ms sleep to mimic a live capture cadence), resets the
// engine a third of the way through, then flushes the tail.
// NOTE(review): data, outdata, and flush_data are never freed — leak in a
// test harness, but worth fixing if this is copied as sample code.
void test_rvc_lite_online() {
//    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
//    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/layers6_checkpoint_14_1660000_1_hubert.mnn";
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
//    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
//    const char *syz_model = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xusong_v1_6hubert_hifix_syz_base_vctk_kd_32k_hubert6_jianli_e225_s62775_205.mnn";
    // xs_model is only referenced by the commented-out switch below.
    const char *xs_model = "/mnt/d/dataset/svc/models/layers_3/layer3_xusong.mnn";
    const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
//    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xiafan_fp16.mnn";
//    const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01.wav";
    const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
//    const char* in_wav = "/mnt/d/dataset/svc/dataset/短数据样本/男声/qiankun.wav";
//    const char* in_wav = "/mnt/d/dataset/tmp/i.wav";
//    const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
//    const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v4.wav";
//    const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/qiankun_412_v4.wav";
    const char *out_wav = "/mnt/d/dataset/tmp/i_out2.wav";
    // Read the audio file; must be 16 k, mono.
    STCWaveFile wav_inst(in_wav, false);
    int sample_rate = wav_inst.GetSampleRate();
    int channel = wav_inst.GetChannels();
    int len = wav_inst.GetTotalFrames() * channel;
    float *data = new float[len];
    float *outdata = new float[len];
    CRvcLiteOnlineRealTime rvc_inst;
    rvc_inst.init(hubert_model_path, sample_rate, channel);
    wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
    int step = 1024;
    printf("start ..\n");
    bool flag = true;
    rvc_inst.switch_synth(syz_model);
    for (int i = 0; i < len; i += step) {
        if (i + step > len) {
            step = len - i;
        }
        struct timeval start;
        struct timeval end;
        gettimeofday(&start, NULL);
        // In-place-capable: output written at the same offsets as the input.
        int ret = rvc_inst.process(data + i, step, outdata+i, step);
        // Simulate real-time pacing between frames.
        std::this_thread::sleep_for(std::chrono::milliseconds (15));
        gettimeofday(&end, NULL);
        printf("ret = %d, sp = %f ms step=%d\n", ret,
               (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0, step);
        // One-shot mid-stream reset (or model switch) a third of the way in.
        if (flag && i >= len / 3) {
            flag = false;
            rvc_inst.reset();
//            rvc_inst.switch_synth(xs_model);
        }
    }
    STCWaveFile wav_out_inst(out_wav, true);
    wav_out_inst.SetSampleRate(sample_rate);
    wav_out_inst.SetChannels(channel);
    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
    wav_out_inst.SetupDone();
    wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
    // flush() allocates flush_data internally; caller owns it afterwards.
    float* flush_data;
    int flush_len;
    rvc_inst.flush(flush_data, flush_len);
    wav_out_inst.WriteFrame(flush_data, flush_len/channel);
    printf("finish2 ....\n");
}
int main() {
// int ret_hubert = test_hubert();
// int ret_contentvec = test_contentvec();
// test();
// test();
// test_rvc_lite_online();
- test_rvc_lite_synth();
+// test_rvc_lite_synth();
+ test_rvc_lite_v2();
return 0;
}
diff --git a/mnn_demo/src/CRvcLiteOnline.cpp b/mnn_demo/src/CRvcLiteOnline.cpp
index 241c6b8..f9067f7 100644
--- a/mnn_demo/src/CRvcLiteOnline.cpp
+++ b/mnn_demo/src/CRvcLiteOnline.cpp
@@ -1,811 +1,831 @@
//
// Created by Administrator on 2023/11/29.
//
#include <cmath>
#include <cstring>
#include <sys/time.h>
#include "CRvcLiteOnline.h"
#include "Hubert.h"
#include "CSynthesizer.h"
#include "espyin-v1.0/ESPYIN.h"
#include "ThreadPool.h"
#include "CRvcCircleBuffer.h"
#include "FfmpegResampler.h"
#include <unistd.h>
// True when `name` is reachable on the filesystem (access(2) F_OK probe).
inline bool file_exists (const std::string& name) {
    return access(name.c_str(), F_OK) == 0;
}
// `size` is the total length of the interleaved input buffer (L/R pairs).
// Each pair is averaged into one mono sample; a trailing odd sample is ignored.
void stereo2mono(float *input, int size, float *output) {
    const int pairs = size / 2;
    for (int k = 0; k < pairs; ++k) {
        output[k] = (input[2 * k] + input[2 * k + 1]) / 2;
    }
}
// Duplicates each of the `size` mono samples into the L and R slots of the
// interleaved output (which must hold 2*size floats).
void mono2stereo(float *input, int size, float *output) {
    for (int idx = 0; idx < size; ++idx) {
        const float s = input[idx];
        output[2 * idx] = s;
        output[2 * idx + 1] = s;
    }
}
// Constructor: derives every frame size from the compile-time gs_* constants
// (fixed 16 k input / 32 k output) and pre-allocates all buffers so that
// process_block() never allocates.
CRvcLiteOnline::CRvcLiteOnline() {
    init_variable();
    m_init = false;
    m_switch_model = false;
    // Input-side variables.
    // Samples per input time slice.
    m_input_block_frame = int(gs_block_time * gs_src_samplerate);
    // Extra history needed by inference.
    m_input_extra_frame = int(gs_extra_time * gs_src_samplerate);
    int zc = gs_src_samplerate / 100; // samples in 10 ms
    int input_corssfade_frame = int(gs_crossfade_time * gs_src_samplerate);
    // Inference buffer length, rounded up to a whole number of 10 ms chunks.
    m_input_predict_buf_frame = int(ceil((m_input_extra_frame + input_corssfade_frame + m_input_block_frame)
            * 1.0 / zc) * zc);
    // Inference buffer, zero-initialized.
    m_input_predict_buf = new float[m_input_predict_buf_frame];
    memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);
    // Output-side variables (same layout, at the destination rate).
    m_crossfade_frame = int(gs_crossfade_time * gs_dst_samplerate);
    m_output_block_frame = int(gs_block_time * gs_dst_samplerate);
    int output_extra_frame = int(gs_extra_time * gs_dst_samplerate);
    zc = gs_dst_samplerate / 100;
    m_output_cache_buf_frame = int(ceil((m_output_block_frame + m_crossfade_frame + output_extra_frame)
            * 1.0 / zc) * zc);
    m_output_cache_buf = new float[m_output_cache_buf_frame];
    memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
    m_crossfade_buf = new float[m_crossfade_frame];
    memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);
    // Pre-sized caches for model inputs/outputs.
    // Dimensions are hard-coded to match the models.
    m_hubert_ret.resize(1);
    m_hubert_ret[0].resize(gs_hubert_frame);
    for (int i = 0; i < gs_hubert_frame; i++) {
        m_hubert_ret[0][i].resize(gs_hubert_dim);
    }
    // Synth model input: 1 x gs_synth_input_frame x gs_synth_input_dim.
    m_synth_input.resize(1);
    m_synth_input[0].resize(gs_synth_input_frame);
    for (int i = 0; i < gs_synth_input_frame; i++) {
        m_synth_input[0][i].resize(gs_synth_input_dim);
    }
    m_synth_out.resize(1);
    m_synth_out[0].resize(1);
    m_synth_out[0][0].resize(gs_synth_output_frame);
}
CRvcLiteOnline::~CRvcLiteOnline() {
uninit();
}
/**********************************对内函数*********************************************/
// Releases the raw buffers and resets every member to its default state.
// Safe to call repeatedly: pointers are null-checked and cleared.
void CRvcLiteOnline::uninit() {
    if (m_input_predict_buf != NULL) {
        delete[] m_input_predict_buf;
        m_input_predict_buf = NULL;
    }
    if (m_output_cache_buf != NULL) {
        delete[] m_output_cache_buf;
        m_output_cache_buf = NULL;
    }
    if (m_crossfade_buf != NULL) {
        delete[] m_crossfade_buf;
        m_crossfade_buf = NULL;
    }
    init_variable();
}
void CRvcLiteOnline::get_pyin_f0() {
for (int i = 0; i < m_input_predict_buf_frame; i += 160) {
m_es_pyin->process(m_input_predict_buf + i);
}
m_f0_data.clear();
ESFeatureSet feats = m_es_pyin->getRemainingFeatures();
if (!feats.empty()) {
m_f0_data.resize(feats[4].size());
for (size_t i = 0; i < feats[4].size(); ++i) {
- // JL_DEBUG
- m_f0_data[i] = feats[4][i].values[0];
+ // 设置变调
+ m_f0_data[i] = feats[4][i].values[0] * m_f0_up_key;
if (m_f0_data[i] < 0) {
m_f0_data[i] = 0;
}
}
}
m_es_pyin->reset();
get_f0_post();
}
// Converts the raw f0 curve (Hz, in m_f0_data) into the coarse form the synth
// model consumes (m_f0_coarse_data): each value is mapped through a mel-style
// curve and quantized to an integer bucket in [1, 255]; 0 Hz / unvoiced lands
// in bucket 1.
// NOTE(review): the standard mel formula uses ln, while log2 is used here —
// presumably this matches the training pipeline; confirm before changing.
void CRvcLiteOnline::get_f0_post() {
    int f0_min = 50;
    int f0_max = 1100;
    // Curve endpoints for the valid pitch range.
    float f0_mel_min = 1127 * log2(1 + f0_min * 1.0 / 700);
    float f0_mel_max = 1127 * log2(1 + f0_max * 1.0 / 700);
    m_f0_coarse_data.clear();
    m_f0_coarse_data.resize(m_f0_data.size());
    for (int i = 0; i < m_f0_data.size(); i++) {
        float f0_mel = 1127 * log2(1 + m_f0_data[i] / 700);
        if (f0_mel > 0) {
            // Rescale into (1, 255] across the valid range.
            f0_mel = (f0_mel - f0_mel_min) * 254.f / (f0_mel_max - f0_mel_min) + 1;
        }
        // Clamp, then round to the nearest integer bucket.
        if (f0_mel <= 1) {
            f0_mel = 1;
        } else if (f0_mel > 255) {
            f0_mel = 255;
        }
        m_f0_coarse_data[i] = float(int(f0_mel + 0.5));
    }
}
void CRvcLiteOnline::init_variable() {
m_init = false;
m_switch_model = false;
// 缓存使用的数据
// 要求输入的时间片长度,采样点数
m_input_block_frame = 0;
m_input_extra_frame = 0;
m_input_predict_buf_frame = 0;
m_input_predict_buf = nullptr;
m_f0_data.clear();
m_f0_coarse_data.clear();
m_crossfade_frame = 0;
m_output_block_frame = 0;
m_output_cache_buf_frame = 0;
m_crossfade_buf = nullptr;
m_output_cache_buf = nullptr;
// 各个实例的返回结果
m_hubert_ret.clear();
m_synth_input.clear();
m_synth_out.clear();
m_fade_in = true;
+ m_f0_up_key = 1.f;
+ m_f0_new_up_key = 1.f;
}
/**********************************对外函数*********************************************/
int CRvcLiteOnline::init(const char *hubert_model_path) {
if (m_init) {
return ERR_RVC_LITE_REINIT;
}
m_hubert_inst = std::make_shared<Hubert>();
m_synthesizer_inst = std::make_shared<CSynthesizer>();
m_hubert_inst->init(hubert_model_path);
// m_synthesizer_inst->init(synth_model_path);
// 要求stepSize必须是2^n
m_es_pyin = std::make_shared<ESPYIN>(16000, 160, 1024, 50, 1100);
m_init = true;
m_switch_model = false;
m_fade_in = true;
+ m_f0_up_key = 1.f;
+ m_f0_new_up_key = 1.f;
return ERR_RVC_LITE_SUCCESS;
}
// Loads a new timbre model, replacing any previous synthesizer instance.
// process_block() is gated on a successful switch (m_switch_model) and
// returns ERR_RVC_LITE_NOT_SWITCH_MODEL until one has happened.
// Returns ERR_RVC_LITE_MODEL_NOT_EXISTS when the file is missing.
int CRvcLiteOnline::switch_synth_model(const char *synth_model_path) {
    if (!m_init) {
        return ERR_RVC_LITE_NOT_INIT;
    }
    if (file_exists(synth_model_path))
    {
        // A fresh CSynthesizer is created; the old instance is released by
        // the shared_ptr reassignment.
        m_synthesizer_inst = std::make_shared<CSynthesizer>();
        m_synthesizer_inst->init(synth_model_path);
        m_switch_model = true;
        return ERR_RVC_LITE_SUCCESS;
    }
    return ERR_RVC_LITE_MODEL_NOT_EXISTS;
}
+void CRvcLiteOnline::set_up_key(int key)
+{
+ if (key > 12)
+ {
+ key = 12;
+ }
+
+ if (key < -12)
+ {
+ key = -12;
+ }
+ m_f0_new_up_key = pow(2, key / 12.f);
+}
+
// Clears every audio cache and re-arms the fade-in, so the first block after
// a discontinuity ramps up smoothly (see the m_fade_in branch in
// process_block()).
void CRvcLiteOnline::reset() {
    memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);
    memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);
    memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
    m_fade_in = true;
}
int CRvcLiteOnline::process_block(float *in_buf, int in_len, float *out_buf, int out_len) {
if (!m_init) {
return ERR_RVC_LITE_NOT_INIT;
}
if (!m_switch_model)
{
return ERR_RVC_LITE_NOT_SWITCH_MODEL;
}
// 外部数据产生不连贯,比如做了reset的时候,需要做fade_in
if (m_fade_in)
{
for(int i = 0; i < in_len; i++)
{
float rate = i * 1.0 / in_len;
in_buf[i] = in_buf[i] * rate;
}
m_fade_in = false;
}
// 剔除尾部的block的数据
memcpy(m_input_predict_buf, m_input_predict_buf + in_len,
sizeof(float) * (m_input_predict_buf_frame - in_len));
// 向尾部填充in_buf的数据
memcpy(m_input_predict_buf + (m_input_predict_buf_frame - in_len), in_buf,
sizeof(float) * in_len);
// 提取f0特征序列
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
+ m_f0_up_key = m_f0_new_up_key;
get_pyin_f0();
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "get pyin sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 推理hubert
gettimeofday(&start, NULL);
m_hubert_inst->process(m_input_predict_buf, m_hubert_ret);
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "m_hubert_inst sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 合成语音
for (int i = 0; i < gs_synth_input_frame; i++) {
// 拷贝数据 1,gs_hubert_frame,258
for (int j = 0; j < gs_hubert_dim; j++) {
m_synth_input[0][i][j] = m_hubert_ret[0][i][j];
}
m_synth_input[0][i][256] = m_f0_coarse_data[i];
m_synth_input[0][i][257] = m_f0_data[i];
}
gettimeofday(&start, NULL);
m_synthesizer_inst->process(m_synth_input, m_synth_out);
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "m_synthesizer_inst sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 将结果全部放到缓存中
memcpy(m_output_cache_buf, m_output_cache_buf + gs_synth_output_frame,
sizeof(float) * (m_output_cache_buf_frame - gs_synth_output_frame));
memcpy(m_output_cache_buf + (m_output_cache_buf_frame - gs_synth_output_frame),
m_synth_out[0][0].data(), sizeof(float) * gs_synth_output_frame);
int start_pos = m_output_cache_buf_frame - m_crossfade_frame - out_len;
memcpy(out_buf, m_output_cache_buf + start_pos, sizeof(float) * out_len);
// 对头部数据做fade_in以及fadeout
for (int i = 0; i < m_crossfade_frame; i++) {
float rate = float(i * 1.f / m_crossfade_frame);
out_buf[i] = rate * out_buf[i] + m_crossfade_buf[i] * (1 - rate);
}
memcpy(m_crossfade_buf, m_output_cache_buf + (m_output_cache_buf_frame - m_crossfade_frame),
sizeof(float) * m_crossfade_frame);
return 0;
}
int CRvcLiteOnline::get_latency_ms() {
- return gs_crossfade_time * 1000;
+ // 此处除了block的延迟,还有推理时hubert理论上应该获取208,实际获取205帧,所以少的30ms
+ return gs_crossfade_time * 1000 + 30;
}
/*******************************对内的类**************************************/
// Starts without a backing resampler; init() decides whether one is needed.
CResample::CResample()
{
    m_resample_inst = nullptr;
}
// Nothing to do: the shared_ptr member releases the resampler automatically.
CResample::~CResample()
{
}
// Configures the conversion path.
// When the rates are equal and only the channel counts differ, no FFmpeg
// resampler is created and resample() performs the channel conversion
// itself; in every other case a CFfmpegResampler is instantiated and
// delegated to.
int CResample::init(int in_samplerate, int out_samplerate, int in_channel, int out_channel)
{
    // Self-driven path only when just the channel counts differ.
    m_in_channel = in_channel;
    m_out_channel = out_channel;
    if (in_samplerate == out_samplerate && in_channel != out_channel) {
        m_resample_inst = nullptr;
    }
    else {
        m_resample_inst = std::make_shared<CFfmpegResampler>();
        return m_resample_inst->init(in_samplerate, out_samplerate, in_channel, out_channel);
    }
    return ERR_RVC_LITE_SUCCESS;
}
// Per-channel output sample count for `num` input samples. Without a backing
// resampler (pure channel conversion) the count is unchanged.
int CResample::get_out_samples(int num)
{
    return m_resample_inst ? m_resample_inst->get_out_samples(num) : num;
}
// Clears the backing resampler's state; a no-op in channel-only mode.
void CResample::reset()
{
    if (m_resample_inst) {
        m_resample_inst->reset();
    }
}
// Latency contributed by the resampler; zero in channel-only mode.
int CResample::get_latency()
{
    return m_resample_inst ? m_resample_inst->get_latency() : 0;
}
// Converts in_buf into out_buf. With a backing resampler, fully delegates;
// otherwise performs the channel conversion configured in init().
// NOTE(review): on the channel-conversion paths out_num is validated but
// never updated to the produced count (in_num/2 floats for stereo->mono,
// 2*in_num floats for mono->stereo) — confirm callers derive the output
// length from in_num on these paths.
int CResample::resample(float *in_buf, int in_num, float *out_buf, int &out_num) {
    if (m_resample_inst) {
        return m_resample_inst->resample(in_buf, in_num, out_buf, out_num);
    }
    if (m_in_channel == 2 && m_out_channel == 1) {
        if (out_num < in_num) {
            return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
        }
        stereo2mono(in_buf, in_num, out_buf);
        return ERR_RVC_LITE_SUCCESS;
    }
    if (m_in_channel == 1 && m_out_channel == 2) {
        if (out_num < in_num) {
            return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
        }
        mono2stereo(in_buf, in_num, out_buf);
        return ERR_RVC_LITE_SUCCESS;
    }
    return ERR_RVC_LITE_SUCCESS;
}
/*******************************对外的类***************************************/
/*******************************对内函数***************************************/
// Resets the bookkeeping members to their defaults. Called by the
// constructor and again at the start of init().
void CRvcLiteOnlineRealTime::init_variable() {
    m_init = false;
    m_rvc_stop = true;
    m_sample_rate = 44100;
    m_channel = 1;
    m_synth_path = "";
    m_new_synth_path = "";
    m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
}
/*******************************对外函数***************************************/
// Constructor only sets defaults; all real setup happens in init().
CRvcLiteOnlineRealTime::CRvcLiteOnlineRealTime() {
    init_variable();
}
// Tears down the pipeline (worker thread, queues, buffers) via uninit().
CRvcLiteOnlineRealTime::~CRvcLiteOnlineRealTime() {
    uninit();
}
// Initializes the full real-time pipeline: core engine, both resamplers,
// the circular queues, the pre-filled latency/output buffers, and the single
// worker thread. Uses goto-to-exit for unified error handling, which is why
// the declarations are hoisted above the first goto.
int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, int sample_rate, int channel) {
    if (m_init) {
        return ERR_RVC_LITE_RT_REINIT;
    }
    // The engine consumes 16 k audio; lower input rates are rejected.
    if (sample_rate < 16000) {
        return ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR;
    }
    init_variable();
    m_sample_rate = sample_rate;
    m_channel = channel;
    m_synth_path = "";
    m_new_synth_path = "";
    m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
    int output_one_sec_number = m_sample_rate * m_channel; // scratch: one second of output
    int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
    CThreadPool::Task task = std::bind(&CRvcLiteOnlineRealTime::rvc_process, this);
    m_rvc_inst = std::make_shared<CRvcLiteOnline>();
    int err = m_rvc_inst->init(hubert_model_path);
    if (ERR_RVC_LITE_SUCCESS != err) {
        goto exit;
    }
    // Resampling: caller rate -> 16 k mono in, 32 k mono -> caller rate out.
    m_resample_queue = std::make_shared<CRvcCircleBuffer>(sample_rate * 3 * m_channel);
    m_resample16 = std::make_shared<CResample>();
    err = m_resample16->init(m_sample_rate, gs_src_samplerate, m_channel, 1);
    if (ERR_RVC_LITE_SUCCESS != err) {
        goto exit;
    }
    m_resample2src = std::make_shared<CResample>();
    err = m_resample2src->init(gs_dst_samplerate, m_sample_rate, 1, m_channel);
    if (ERR_RVC_LITE_SUCCESS != err) {
        goto exit;
    }
    m_resample_buf_max_len = 2048; // scratch buffers: callers must stay within this size
    m_resample_in_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
    m_resample_out_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
    // Core processing scratch buffers: one second at 16 k in, 32 k out.
    m_input_tmp_buf_len = gs_src_samplerate;
    m_output_tmp_buf_len = gs_dst_samplerate;
    m_input_tmp_buf = std::shared_ptr<float>(new float[m_input_tmp_buf_len], std::default_delete<float[]>());
    m_output_tmp_buf = std::shared_ptr<float>(new float[m_output_tmp_buf_len], std::default_delete<float[]>());
    memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
    memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
    // Circular buffers.
    m_input_queue = std::make_shared<CRvcCircleBuffer>(m_input_tmp_buf_len * 3);
    // The outward-facing queue holds data at the caller's rate and channel count.
    m_out_queue = std::make_shared<CRvcCircleBuffer>(output_one_sec_number * 3);
    m_latency_queue = std::make_shared<CRvcCircleBuffer>(latency_len);
    // Pre-load two seconds of silence so the latency settles at 2 s.
    for (int i = 0; i < 2; i++) {
        // Push one second of zeros (m_output_tmp_buf was just memset).
        for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
            m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
        }
        m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
    }
    // The algorithm itself has latency, so the same delay is pre-loaded into
    // the pass-through path to keep overall latency constant without effect.
    for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
        m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
    }
    m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
    // Start the single worker thread.
    m_thread_pool = std::make_shared<CThreadPool>();
    m_thread_pool->start(1);
    m_rvc_stop = false;
    m_thread_pool->run(task);
    m_init = true;
exit:
    if (ERR_RVC_LITE_SUCCESS != err) {
        // NOTE(review): m_init is forced on before uninit() — presumably
        // uninit() early-outs when !m_init; confirm in its implementation.
        m_init = true;
        uninit();
    }
    return err;
}
/**
 * Queue a voice-model switch; the worker thread picks it up on its next
 * iteration. Pass "" to switch back to pass-through.
 */
int CRvcLiteOnlineRealTime::switch_synth(const char *synth_model_path) {
    if (!m_init) {
        return ERR_RVC_LITE_RT_NOT_INIT;
    }
    std::lock_guard<std::mutex> guard(m_rvc_mutex);
    m_new_synth_path = synth_model_path;
    return ERR_RVC_LITE_SUCCESS;
}
/**
 * Feed raw input and fetch converted output (both at the caller's
 * sample rate / channel layout).
 *
 * @return ERR_RVC_LITE_RT_NOT_ENOUGH_DATA when the pipeline could not yet
 *         supply out_len samples (the shortfall is left as silence).
 */
int CRvcLiteOnlineRealTime::process(float *in_buf, int in_len, float *out_buf, int out_len) {
    if (!m_init) {
        return ERR_RVC_LITE_RT_NOT_INIT;
    }
    // Hand the new input to the worker thread and wake it.
    {
        std::lock_guard<std::mutex> guard(m_rvc_mutex);
        m_resample_queue->push(in_buf, in_len);
        m_rvc_cond.notify_all();
    }
    // Pre-zero the output so a short pop still leaves silence at the tail.
    memset(out_buf, 0, sizeof(float) * out_len);
    int popped = out_len;
    {
        std::lock_guard<std::mutex> guard(m_rvc_mutex);
        m_out_queue->pop(out_buf, popped);
    }
    return (popped == out_len) ? ERR_RVC_LITE_SUCCESS
                               : ERR_RVC_LITE_RT_NOT_ENOUGH_DATA;
}
void CRvcLiteOnlineRealTime::reset() {
if (!m_init) {
return;
}
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_resample_queue->reset();
m_resample16->reset();
m_resample2src->reset();
m_input_queue->reset();
m_out_queue->reset();
m_rvc_inst->reset();
m_latency_queue->reset();
// 提前塞入两组,保证延迟稳定在2s
int output_one_sec_number = m_sample_rate * m_channel; // 临时使用的数据
memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
for (int i = 0; i < 2; i++) {
for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
}
// 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟
int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
}
}
/**
 * Drain every sample still buffered in the pipeline into a newly allocated
 * buffer. Stops the worker thread first, so this ends the session.
 *
 * @param out_buf set to a `new float[len]` buffer; caller takes ownership
 *                (NOTE(review): presumably freed with delete[] — confirm convention)
 * @param len     set to the number of samples written
 */
void CRvcLiteOnlineRealTime::flush(float *&out_buf, int &len) {
    /**
     * Stop the worker thread first.
     * NOTE(review): queues are accessed below without m_rvc_mutex; this looks
     * safe only because stop() has joined the worker — confirm before reuse.
     */
    stop();
    // Case 1: pass-through (no voice model active).
    int resample_in_len = 0;
    int resample_out_len = 0;
    if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
    {
        // Route remaining raw input through the latency queue to keep the
        // output delay-aligned, then hand everything to m_out_queue.
        while (m_resample_queue->size() > 0) {
            resample_in_len = m_resample_buf_max_len;
            m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
            m_latency_queue->push(m_resample_in_buf.get(), resample_in_len);
            m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
            m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
        }
        // Then drain whatever is left in the latency queue.
        while(m_latency_queue->size() > 0)
        {
            resample_in_len = m_resample_buf_max_len;
            m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
            m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
        }
        len = m_out_queue->size();
        out_buf = new float[len];
        m_out_queue->pop(out_buf, len);
        return;
    }
    // Case 2: voice conversion active — push remaining input through the model.
    while (m_resample_queue->size() > 0) {
        resample_in_len = m_resample_buf_max_len;
        m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
        // Input sample count must account for the channel count.
        resample_out_len = m_resample16->get_out_samples(resample_in_len / m_channel);
        m_resample16->resample(m_resample_in_buf.get(), resample_in_len / m_channel, m_resample_out_buf.get(),
                               resample_out_len);
        // Output is 16 k mono, so no channel factor here.
        m_input_queue->push(m_resample_out_buf.get(), resample_out_len);
    }
    // Zero-pad up to a whole block so the final partial block gets processed.
    memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
    int add_size = m_input_tmp_buf_len - m_input_queue->size() % m_input_tmp_buf_len;
    if (add_size != 0 && add_size < m_input_tmp_buf_len) {
        m_input_queue->push(m_input_tmp_buf.get(), add_size);
    }
    int num = m_input_queue->size() / m_input_tmp_buf_len;
    for (int i = 0; i < num; i++) {
        rvc_process_step();
    }
    // Copy everything out.
    len = m_out_queue->size();
    out_buf = new float[len];
    m_out_queue->pop(out_buf, len);
}
// Total end-to-end latency in ms: the core algorithm's latency plus the
// 2 s of silence pre-buffered into m_out_queue by init()/reset().
int CRvcLiteOnlineRealTime::get_latency_ms() {
    return m_rvc_inst->get_latency_ms() + 2000;
}
/*******************************对内函数***************************************/
/**
 * Tear down the running pipeline. Safe to call more than once; a no-op
 * when the instance was never initialized.
 */
void CRvcLiteOnlineRealTime::uninit() {
    if (!m_init) {
        return;
    }
    stop();
    // Mark the instance uninitialized so init() may be called again.
    // (The original never cleared the flag, so after uninit() every
    // subsequent init() failed with ERR_RVC_LITE_RT_REINIT.)
    m_init = false;
}
/**
 * Stop the worker thread: raise the stop flag first so a thread blocked on
 * the condition variable wakes, observes the flag, and exits before the
 * thread pool is joined.
 */
void CRvcLiteOnlineRealTime::stop() {
    m_rvc_stop = true;
    if (!m_thread_pool) {
        return;
    }
    m_rvc_cond.notify_all();
    m_thread_pool->stop();
}
/**
 * Run one block through the conversion core: pop one full 16 k mono block,
 * infer, then resample back to the caller's rate/channels in chunks and
 * push the result into m_out_queue.
 */
void CRvcLiteOnlineRealTime::rvc_process_step() {
    struct timeval start;
    struct timeval end;
    int sample_out_len = 0;
    // A full block is required before anything can be processed.
    if (m_input_queue->size() < m_input_tmp_buf_len) {
        return;
    }
    gettimeofday(&start, NULL);
    m_input_queue->pop(m_input_tmp_buf.get(), m_input_tmp_buf_len);
    m_rvc_inst->process_block(m_input_tmp_buf.get(), m_input_tmp_buf_len,
                              m_output_tmp_buf.get(), m_output_tmp_buf_len);
    gettimeofday(&end, NULL);
    LOGD("RvcLite", "rvc_process process sp %f ms",
         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
    // Resample back to the output format. The sample rate grows by less than
    // 2x but the channel count may double, so process the scratch buffer in
    // quarter-sized chunks to stay within m_resample_buf_max_len.
    gettimeofday(&start, NULL);
    bool last = false;
    int step = m_resample_buf_max_len / 4;
    for (int i = 0; i < m_output_tmp_buf_len; i += step) {
        if (i + step >= m_output_tmp_buf_len) {
            step = m_output_tmp_buf_len - i;
            last = true;
        }
        // Input here is mono, so sample count equals buffer length.
        sample_out_len = m_resample2src->get_out_samples(step);
        m_resample2src->resample(m_output_tmp_buf.get() + i, step, m_resample_out_buf.get(), sample_out_len);
        // Effect -> pass-through transition: fade this final chunk out.
        if (last && m_syn_state == RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT)
        {
            // Pass-through is delay-aligned too, so a fade-out here suffices.
            // BUGFIX: scale over the *output* chunk length. The original
            // divided by the input length `step`; after upsampling
            // sample_out_len * m_channel > step, so `rate` exceeded 1.0 and
            // (1 - rate) went negative, sign-flipping the tail samples.
            int fade_total = sample_out_len * m_channel;
            for (int ii = 0; ii < fade_total; ii += m_channel)
            {
                float rate = ii * 1.0f / fade_total;
                for (int jj = 0; jj < m_channel; jj++)
                {
                    m_resample_out_buf.get()[ii + jj] = m_resample_out_buf.get()[ii + jj] * (1 - rate);
                }
            }
            m_syn_state = RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT;
        }
        {
            std::unique_lock<std::mutex> lock(m_rvc_mutex);
            m_out_queue->push(m_resample_out_buf.get(), sample_out_len * m_channel);
        }
    }
    gettimeofday(&end, NULL);
    LOGD("RvcLite", "rvc_process re_resample sp %f ms",
         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
    // Removed stray debug `printf("finish ...\n")` that bypassed the LOGD gating.
}
/**
 * Worker-thread loop: pulls raw input chunks from m_resample_queue, runs the
 * model-switch fade state machine, and feeds the conversion pipeline.
 * Loops until m_rvc_stop is raised by stop().
 */
void CRvcLiteOnlineRealTime::rvc_process() {
    int sample_in_len;
    int sample_out_len = 0;
    while (!m_rvc_stop) {
        {
            // Fetch one chunk of raw input under the lock.
            std::unique_lock<std::mutex> lock(m_rvc_mutex);
            if (m_resample_queue->size() < m_resample_buf_max_len) {
                // Re-check the stop flag before sleeping.
                if (m_rvc_stop) {
                    return;
                }
                m_rvc_cond.wait(lock);
                continue;
            }
            sample_in_len = m_resample_buf_max_len;
            m_resample_queue->pop(m_resample_in_buf.get(), sample_in_len);
        }
        /**
         * Three transitions are possible here. Because the effect path is
         * delayed, every transition pads zeros into the input, so fading the
         * current chunk out is sufficient:
         * 1. none -> effect: fade this chunk out, fade the next effect-input chunk in
         * 2. effect -> none: fade this chunk out, fade the next pass-through chunk in
         * 3. effect -> effect: handled internally by the model, nothing to do here
         */
        if (m_synth_path != m_new_synth_path) {
            // none -> effect: fade this chunk out, fade the next input in.
            if(m_synth_path.empty() && !m_new_synth_path.empty())
            {
                m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT;
            }
            // effect -> none
            if (!m_synth_path.empty() && m_new_synth_path.empty())
            {
                m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT;
            }
            {
                std::unique_lock<std::mutex> lock(m_rvc_mutex);
                m_synth_path = m_new_synth_path;
            }
            // NOTE(review): runs outside m_rvc_mutex — confirm switch_synth_model is thread-safe.
            m_rvc_inst->switch_synth_model(m_new_synth_path.c_str());
        }
        // First chunk right after switching back to pass-through.
        if(m_syn_state == RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT)
        {
            // Just left the effect: clear the latency queue and fade the input in.
            m_latency_queue->reset();
            // The algorithm is delayed; re-add the same delay for pass-through mode.
            memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
            int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
            for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
                m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
            }
            m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
            // Fade the input in, frame by frame across all channels.
            for(int i = 0; i < sample_in_len; i+=m_channel)
            {
                float rate = i * 1.0 / sample_in_len;
                for(int j = 0; j < m_channel; j++)
                {
                    m_resample_in_buf.get()[i+j] *= rate;
                }
            }
            m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
        }
        // Pass-through: delay-align via the latency queue and emit as-is.
        if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
        {
            m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
            m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
            {
                std::unique_lock<std::mutex> lock(m_rvc_mutex);
                m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
            }
            continue;
        }
        // Transition: pass-through -> effect.
        if (m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT)
        {
            // Fade the current pass-through chunk out.
            for(int i = 0; i < sample_in_len; i+=m_channel)
            {
                float rate = i * 1.0 / sample_in_len;
                for(int j = 0; j < m_channel; j++)
                {
                    m_resample_in_buf.get()[i+j] *= 1 - rate;
                }
            }
            m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
            m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
            {
                std::unique_lock<std::mutex> lock(m_rvc_mutex);
                m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
            }
            // The effect's input stream is no longer contiguous here, so wipe
            // its internal state and start fresh.
            m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT;
            m_rvc_inst->reset();
            continue;
        }
        // Downsample to 16 k mono; the rate drops, so sample_out_len never
        // exceeds sample_in_len.
        sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel);
        m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(),
                               sample_out_len);
        m_input_queue->push(m_resample_out_buf.get(), sample_out_len);
        rvc_process_step();
    }
}
\ No newline at end of file
diff --git a/mnn_demo/src/CRvcLiteOnlineV2.cpp b/mnn_demo/src/CRvcLiteOnlineV2.cpp
new file mode 100644
index 0000000..0269a7d
--- /dev/null
+++ b/mnn_demo/src/CRvcLiteOnlineV2.cpp
@@ -0,0 +1,215 @@
+//
+// Created by Administrator on 2024/1/22.
+//
+
+#include "CRvcLiteOnlineV2.h"
+#include "CRvcCircleBuffer.h"
+#include <unistd.h>
+
+inline bool file_exists1 (const std::string& name) {
+ return ( access( name.c_str(), F_OK ) != -1 );
+}
+
+CRvcLiteOnlineV2::CRvcLiteOnlineV2()
+{
+
+}
+
+CRvcLiteOnlineV2::~CRvcLiteOnlineV2()
+{
+
+}
+/*****************************************对内函数***************************************************************/
+void CRvcLiteOnlineV2::set_cur_state(bool reset)
+{
+ /**
+ * 一共三种状态
+ * 从无到有: 让不做效果的fade_out,做效果的fade_in
+ * 从有到无: 让做效果的fade_out, 不做效果的fade_in即可
+ * 从有到有,这种情况不考虑,内部自己会做fade
+ */
+ if (m_syn_model != m_new_syn_model)
+ {
+ // 从无到有
+ if (m_syn_model.empty() && !m_new_syn_model.empty())
+ {
+ m_sync_state = CRVC_V2_STATE_DEFAULT2EFFECT;
+
+ // 如果此时已经发生了reset,则不需要做切换,直接做就行
+ if (reset)
+ {
+ m_sync_state = CRVC_V2_STATE_EFFECT;
+ }
+ m_syn_model = m_new_syn_model;
+ m_rvc_inst->switch_model(m_syn_model.c_str());
+ }
+
+ // 从有到无
+ if (!m_syn_model.empty() && m_new_syn_model.empty())
+ {
+ m_sync_state = CRVC_V2_STATE_EFFECT2DEFAULT;
+ // 如果此时已经发生了reset,则不需要做切换,直接做就行
+ if (reset)
+ {
+ m_sync_state = CRVC_V2_STATE_DEFAULT;
+ }
+ m_syn_model = m_new_syn_model;
+ }
+ }
+}
+
+/*****************************************对外函数***************************************************************/
+int CRvcLiteOnlineV2::init(const char *hubert_model, int sample_rate, int channel)
+{
+ m_rvc_inst = std::make_shared<CRvcLiteSynthesizer>();
+ m_block_len = sample_rate * channel - 100 * channel;
+ m_tmp_buf_len = m_block_len * 2;
+ m_reset = true;
+ m_syn_model = "";
+ m_new_syn_model = "";
+ m_sync_state = CRVC_V2_STATE_DEFAULT;
+ m_fade_len = int(sample_rate * 0.05) * channel; // 50ms的时长用来做fade
+ m_channel = channel;
+
+ m_tmp_in_buf = std::shared_ptr<float>(new float[m_tmp_buf_len], std::default_delete<float[]>());
+ m_tmp_out_buf = std::shared_ptr<float>(new float[m_tmp_buf_len], std::default_delete<float[]>());
+ m_in_queue = std::make_shared<CRvcCircleBuffer>(m_tmp_buf_len * 2);
+ m_out_queue = std::make_shared<CRvcCircleBuffer>(m_tmp_buf_len * 2);
+ m_input_latency_output_frame = 0;
+ return m_rvc_inst->init(hubert_model, sample_rate, channel);
+}
+
+int CRvcLiteOnlineV2::switch_model(const char *synth_model)
+{
+    if (synth_model[0] != '\0' && !file_exists1(synth_model))
+ {
+ return ERR_RVC_LITE_MODEL_NOT_EXISTS;
+ }
+
+ m_new_syn_model = synth_model;
+ return ERR_RVC_LITE_SUCCESS;
+}
+
+void CRvcLiteOnlineV2::set_up_key(int key)
+{
+ // 内部是线程安全的,所以直接设置即可
+ m_rvc_inst->set_up_key(key);
+}
+
+void CRvcLiteOnlineV2::reset()
+{
+ m_reset = true;
+}
+
+
+int CRvcLiteOnlineV2::push(float *buf, int len, bool last)
+{
+ bool reset = m_reset;
+ if (m_reset)
+ {
+ m_reset = false;
+ m_input_latency_output_frame = 0;
+ m_in_queue->reset();
+ m_out_queue->reset();
+ m_rvc_inst->reset();
+ }
+
+ set_cur_state(reset);
+
+ if (CRVC_V2_STATE_DEFAULT == m_sync_state)
+ {
+ std::unique_lock<std::mutex> lock(m_rvc_mutex);
+ m_out_queue->push(buf, len);
+ return ERR_RVC_LITE_SUCCESS;
+ }
+
+ // 此时无论怎样,都要让模型跑一下,得到结果再说
+ m_in_queue->push(buf, len);
+ while(m_in_queue->size() >= m_block_len || last) {
+ if (m_in_queue->size() <= 0)
+ {
+ return ERR_RVC_LITE_SUCCESS;
+ }
+
+ int cur_in_len = m_block_len;
+ int cur_out_len = m_block_len;
+ m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len);
+ int err = m_rvc_inst->process(m_tmp_in_buf.get(), cur_in_len, m_tmp_out_buf.get(), cur_out_len);
+ if (err != ERR_RVC_LITE_SUCCESS) {
+ return err;
+ }
+
+ // 此时对于effect做fade_out,default做fade_in
+ if (m_sync_state == CRVC_V2_STATE_EFFECT2DEFAULT)
+ {
+ // 此时由于m_rvc_inst本身存在延迟输出的情况[虽然头部的静音帧已经被砍掉了],但是其输入的数据和输出的数据并不是完美对应的,存在延迟差
+ // 所以此时输入的头部和输出的头部之前存在延迟差,但是不加音效是没有这个延迟差的
+ // 所以需要将输入的头部对应到其应该对应的输出真实数据的头部
+ // 比如: 输入: 1,2,3,4,5 输出: l1,l2,1,2,3 ,其中l1和l2是延迟采样点,也就是1,2,对应的是输出+延迟采样点才对
+ for(int i = 0; i < m_fade_len; i+=m_channel)
+ {
+ float rate = i * 1.0 / m_fade_len;
+ for(int j = 0; j < m_channel; j+=1)
+ {
+ m_tmp_in_buf.get()[i+j] = m_tmp_in_buf.get()[i+j] * rate + m_tmp_out_buf.get()[i+j+m_input_latency_output_frame] * (1 - rate);
+ }
+ }
+ {
+ std::unique_lock<std::mutex> lock(m_rvc_mutex);
+ // 将之前要输入的那块塞进去
+ m_out_queue->push(m_tmp_out_buf.get(), m_input_latency_output_frame);
+ m_out_queue->push(m_tmp_in_buf.get(), cur_in_len);
+ }
+
+ m_sync_state = CRVC_V2_STATE_DEFAULT;
+ m_input_latency_output_frame = 0;
+
+ while(m_in_queue->size() > 0)
+ {
+ cur_in_len = m_block_len;
+ m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len);
+ {
+ std::unique_lock<std::mutex> lock(m_rvc_mutex);
+ m_out_queue->push(m_tmp_in_buf.get(), cur_in_len);
+ }
+ }
+ return ERR_RVC_LITE_SUCCESS;
+ }
+
+ // 此时对effect做fade_in,default做fade_out
+ if (m_sync_state == CRVC_V2_STATE_DEFAULT2EFFECT)
+ {
+ for(int i = 0; i < m_fade_len; i+=m_channel)
+ {
+ float rate = i * 1.0 / m_fade_len;
+ for(int j = 0; j < m_channel; j+=1)
+ {
+ m_tmp_out_buf.get()[i+j] = m_tmp_out_buf.get()[i+j] * rate + m_tmp_in_buf.get()[i+j] * (1 - rate);
+ }
+ }
+ // 设置状态
+ m_sync_state = CRVC_V2_STATE_EFFECT;
+ }
+
+ // effect会存在输入和输出长度不一致的情况
+ m_input_latency_output_frame += cur_in_len - cur_out_len;
+
+ // 加锁塞入数据
+ {
+ std::unique_lock<std::mutex> lock(m_rvc_mutex);
+ m_out_queue->push(m_tmp_out_buf.get(), cur_out_len);
+ }
+ }
+ return ERR_RVC_LITE_SUCCESS;
+}
+
+int CRvcLiteOnlineV2::size()
+{
+ return m_out_queue->size();
+}
+
+void CRvcLiteOnlineV2::pop(float *buf, int &len)
+{
+ std::unique_lock<std::mutex> lock(m_rvc_mutex);
+ m_out_queue->pop(buf, len);
+}
diff --git a/mnn_demo/src/CRvcLiteSynthesizer.cpp b/mnn_demo/src/CRvcLiteSynthesizer.cpp
index 9bce8d7..6ff952b 100644
--- a/mnn_demo/src/CRvcLiteSynthesizer.cpp
+++ b/mnn_demo/src/CRvcLiteSynthesizer.cpp
@@ -1,106 +1,128 @@
//
// Created by Administrator on 2024/1/21.
//
#include "CRvcLiteSynthesizer.h"
#include <cstring>
#include <sys/time.h>
CRvcLiteSynthesizer::CRvcLiteSynthesizer(){}
CRvcLiteSynthesizer::~CRvcLiteSynthesizer() {}
-int CRvcLiteSynthesizer::init(const char *hubert_model, const char *synth_model, int sample_rate, int channel)
+int CRvcLiteSynthesizer::init(const char *hubert_model, int sample_rate, int channel)
{
m_rvc_inst = std::make_shared<CRvcLiteOnline>();
int err = m_rvc_inst->init(hubert_model);
if (err != ERR_RVC_LITE_SUCCESS)
{
return err;
}
- err = m_rvc_inst->switch_synth_model(synth_model);
- if (err != ERR_RVC_LITE_SUCCESS)
- {
- return err;
- }
+
m_resample2_16 = std::make_shared<CResample>();
m_resample2_16->init(sample_rate, gs_src_samplerate, channel, 1);
m_resample2src = std::make_shared<CResample>();
m_resample2src->init(gs_dst_samplerate, sample_rate, 1, channel);
m_channel = channel;
m_sample_rate = sample_rate;
m_buf_tmp_16k_len = 0;
m_buf_tmp_16k_cap = 0;
m_buf_tmp_32k_len = 0;
m_buf_tmp_32k_cap = 0;
m_buf_tmp_src_len = 0;
m_buf_tmp_src_cap = 0;
+ m_first = true;
return ERR_RVC_LITE_SUCCESS;
}
+int CRvcLiteSynthesizer::switch_model(const char *synth_model)
+{
+ return m_rvc_inst->switch_synth_model(synth_model);
+}
+
+void CRvcLiteSynthesizer::set_up_key(int key)
+{
+ m_rvc_inst->set_up_key(key);
+}
+
+void CRvcLiteSynthesizer::reset()
+{
+ m_rvc_inst->reset();
+ m_first = true;
+}
+
int CRvcLiteSynthesizer::process(float *in_buf, int in_len, float *out_buf, int &out_len) {
// 1 重采样 2 推理 3 再次重采样
int resample_out_len = m_resample2_16->get_out_samples(in_len / m_channel);
// 控制逻辑,不能超过该长度
if (resample_out_len > gs_src_samplerate) {
return ERR_RVC_LITE_BLOCK_TOO_LONG;
}
if (m_buf_tmp_16k_cap < resample_out_len) {
m_buf_tmp_16k_cap = resample_out_len;
m_buf_tmp_16k = std::shared_ptr<float>(new float[m_buf_tmp_16k_cap], std::default_delete<float[]>());
}
m_buf_tmp_16k_len = resample_out_len;
int err = m_resample2_16->resample(in_buf, in_len / m_channel, m_buf_tmp_16k.get(), m_buf_tmp_16k_len);
if (err != ERR_RVC_LITE_SUCCESS) {
return err;
}
if (m_buf_tmp_32k_cap < m_buf_tmp_16k_len * 2) {
m_buf_tmp_32k_cap = m_buf_tmp_16k_len * 2;
m_buf_tmp_32k = std::shared_ptr<float>(new float[m_buf_tmp_32k_cap], std::default_delete<float[]>());
}
m_buf_tmp_32k_len = m_buf_tmp_16k_len * 2;
// 推理
err = m_rvc_inst->process_block(m_buf_tmp_16k.get(), m_buf_tmp_16k_len, m_buf_tmp_32k.get(), m_buf_tmp_32k_len);
if (err != ERR_RVC_LITE_SUCCESS) {
return err;
}
// 重采样回来
int out_frame = m_resample2src->get_out_samples(m_buf_tmp_32k_len);
if (m_buf_tmp_src_cap < out_frame * m_channel) {
m_buf_tmp_src_cap = out_frame * m_channel;
m_buf_tmp_src = std::shared_ptr<float>(new float[m_buf_tmp_src_cap], std::default_delete<float[]>());
}
m_buf_tmp_src_len = out_frame;
err = m_resample2src->resample(m_buf_tmp_32k.get(), m_buf_tmp_32k_len, m_buf_tmp_src.get(), m_buf_tmp_src_len);
if (err != ERR_RVC_LITE_SUCCESS) {
return err;
}
// 取较小的值
if (out_len > m_buf_tmp_src_len * m_channel)
{
out_len = m_buf_tmp_src_len * m_channel;
}
- memcpy(out_buf, m_buf_tmp_src.get(), sizeof(float) * out_len);
+ // 第一次过来,将头部的延迟块切掉
+ int latency_frame = 0;
+ if (m_first)
+ {
+ m_first = false;
+ latency_frame = int(m_rvc_inst->get_latency_ms() * 1.0 / 1000 * m_sample_rate) * m_channel;
+ out_len -= latency_frame;
+ }
+ memcpy(out_buf, m_buf_tmp_src.get()+latency_frame, sizeof(float) * out_len);
return ERR_RVC_LITE_SUCCESS;
}
+
float CRvcLiteSynthesizer::get_rtf()
{
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
int in_len = m_sample_rate * m_channel - 100 *m_channel;
int out_len = in_len;
float* in_buf = new float[in_len];
process(in_buf, in_len, in_buf, in_len);
delete [] in_buf;
gettimeofday(&end, NULL);
double sp = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0;
return sp / 1000;
}
\ No newline at end of file
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:33 (1 d, 11 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347191
Default Alt Text
(67 KB)
Attached To
R350 av_svc
Event Timeline
Log In to Comment