Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F4880317
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
56 KB
Subscribers
None
View Options
diff --git a/mnn_demo/CMakeLists.txt b/mnn_demo/CMakeLists.txt
index 0faf584..895b2c7 100644
--- a/mnn_demo/CMakeLists.txt
+++ b/mnn_demo/CMakeLists.txt
@@ -1,70 +1,69 @@
cmake_minimum_required(VERSION 3.0)
project(mnn_demo)
set(CMAKE_CXX_STANDARD 14)
set(MNN_DIR /opt/soft/MNN)
set(FFMPEG_DIR /opt/soft/ffmpeg/ffmpeg)
set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib)
IF (WIN32)
MESSAGE(STATUS "Now is windows")
set(MNN_DIR D:/soft/mnn/MNN)
set(FFMPEG_DIR D:/soft/ffmpeg/linux_ffmpeg)
ENDIF ()
# MNN
include_directories(${MNN_DIR}/include)
include_directories(${MNN_DIR}/include/MNN)
# ffmpeg
include_directories(${FFMPEG_DIR}/include)
# 项目头文件
include_directories(inc src)
include_directories(ref)
include_directories(ref/av_waves/waves/inc)
include_directories(ref/av_resample/audio_resample/inc)
include_directories(ref/thread)
include_directories(third_party)
# 第三方依赖
add_subdirectory(ref)
# 源数据
FILE(GLOB SRC_DIR ${PROJECT_SOURCE_DIR}/src/*cpp)
FILE(GLOB THIRD_SRC_DIR ${PROJECT_SOURCE_DIR}/third_party/espyin-v1.0/*cpp)
#add_library(svc_lite ${SRC_DIR})
-add_executable(mnn_demo main.cpp ${SRC_DIR} ${THIRD_SRC_DIR}
- tests/test_flatbuffer.cpp)
+add_executable(mnn_demo main.cpp ${SRC_DIR} ${THIRD_SRC_DIR})
#add_executable(test_circle_buffer tests/test_CRvcCircleBuffer.cpp src/CRvcCircleBuffer.cpp)
# Link the MNN static library from the configured MNN_DIR (instead of a hard-coded
# /opt/soft/MNN path) so that the WIN32 override of MNN_DIR above is honoured.
# The remaining archives are looked up in LIBRARY_OUTPUT_PATH
# (presumably produced by add_subdirectory(ref) -- verify against ref/CMakeLists.txt).
target_link_libraries(mnn_demo
        ${MNN_DIR}/build/libMNN.a
        ${LIBRARY_OUTPUT_PATH}/libwaves.a
        ${LIBRARY_OUTPUT_PATH}/libthread.a
        ${LIBRARY_OUTPUT_PATH}/libaudio_resample.a
        )
target_link_libraries(mnn_demo
${FFMPEG_DIR}/lib/libavfilter.a
${FFMPEG_DIR}/lib/libavformat.a
${FFMPEG_DIR}/lib/libavcodec.a
${FFMPEG_DIR}/lib/libswresample.a
${FFMPEG_DIR}/lib/libswscale.a
${FFMPEG_DIR}/lib/libavutil.a
)
# 测试代码
#include_directories(${MNN_DIR}/tools)
#include_directories(${MNN_DIR}/3rd_party/flatbuffers/include)
#
#add_executable(test_flat_buffer tests/test_flatbuffer.cpp ${FLAT_SRC_DIR}
# ${MNN_DIR}/3rd_party/flatbuffers/src/code_generators.cpp
# ${MNN_DIR}/3rd_party/flatbuffers/src/idl_gen_text.cpp
# ${MNN_DIR}/3rd_party/flatbuffers/src/idl_parser.cpp
# ${MNN_DIR}/3rd_party/flatbuffers/src/util.cpp
# tests/half.hpp
#)
\ No newline at end of file
diff --git a/mnn_demo/inc/CRvcLiteOnline.h b/mnn_demo/inc/CRvcLiteOnline.h
index f1c7677..ad62799 100644
--- a/mnn_demo/inc/CRvcLiteOnline.h
+++ b/mnn_demo/inc/CRvcLiteOnline.h
@@ -1,296 +1,309 @@
//
// Created by jianli.yang on 2023/11/29.
//
#ifndef MNN_DEMO_CRVCLITEONLINE_H
#define MNN_DEMO_CRVCLITEONLINE_H
#define DEBUG
#ifdef __ANDROID__
#include <android/log.h>
#ifdef STRELEASE
#define LOGD(...)
#define LOGE(...)
#else
#define LOGD(TAG, ...) __android_log_print(ANDROID_LOG_DEBUG , TAG, __VA_ARGS__)
#define LOGE(TAG, ...) __android_log_print(ANDROID_LOG_ERROR , TAG, __VA_ARGS__)
#endif
#else
#ifdef DEBUG
// Non-Android debug logging: print the tag, then the printf-style message.
// Each macro is wrapped in do { ... } while (0) so that it expands to exactly one
// statement and stays correct inside an unbraced if/else; the call site supplies
// the trailing semicolon (as the Android variants above already require).
#define LOGD(TAG, ...) do { printf("\nDebug: %s", TAG); printf(__VA_ARGS__); } while (0)
#define LOGE(TAG, ...) do { printf("\nError: %s", TAG); printf(__VA_ARGS__); } while (0)
#else
#define LOGD(TAG, ...)
#define LOGE(TAG, ...)
#endif
#endif
#include <mutex>
#include <string>
#include <memory>
#include <vector>
#include <condition_variable>
#define gs_src_samplerate 16000
#define gs_dst_samplerate 32000
#define gs_crossfade_time 0.08 // 单位是s
#define gs_block_time 1
#define gs_extra_time 1
#define gs_hubert_frame 206 // 和模型相关
#define gs_hubert_dim 256 // 和模型相关
#define gs_synth_input_frame 205 // 和模型相关
#define gs_synth_input_dim 258 // 和模型相关
#define gs_synth_output_frame 35840 // 和模型相关
enum {
ERR_RVC_LITE_SUCCESS = 0,
ERR_RVC_LITE_NOT_INIT = 1,
ERR_RVC_LITE_REINIT = 2,
ERR_RVC_LITE_RT_REINIT = 3,
ERR_RVC_LITE_RT_NOT_INIT = 4,
ERR_RVC_LITE_RT_NOT_ENOUGH_DATA = 5,
ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR = 6, // 采样率小于16000
ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT = 7, // 重采样后的buf太短
+ ERR_RVC_LITE_NOT_SWITCH_MODEL = 8, // no synth model loaded yet (switch_synth_model has not succeeded)
+ ERR_RVC_LITE_MODEL_NOT_EXISTS = 9, // 没有人声模型
+ ERR_RVC_LITE_BLOCK_TOO_LONG = 10, // 区块过大
};
+const int RVC_LITE_RT_SYN_STATE_DEFAULT = 0;
+const int RVC_LITE_RT_SYN_STATE_EFFECT = 1;
+const int RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT = 2;
+const int RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT = 3;
+const int RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT = 4;
class Hubert;
class CSynthesizer;
class ESPYIN;
class CThreadPool;
class CRvcCircleBuffer;
class CFfmpegResampler;
/**
* Rvc轻量化实时推理代码
* 要求输入16k的音频数据,输出是目标采样率的数据
*/
class CRvcLiteOnline {
public:
CRvcLiteOnline();
~CRvcLiteOnline();
private:
void uninit();
void get_f0_post();
void get_pyin_f0();
void init_variable();
public:
/**
* 初始化函数
* @param hubert_model_path
- * @param synth_model_path
* @return 0 表示正常
*/
- int init(const char *hubert_model_path, const char *synth_model_path);
+ int init(const char *hubert_model_path);
/**
* 换音色模型
* @param synth_model_path
* @return
*/
int switch_synth_model(const char* synth_model_path);
/**
* 处理定长的一帧数据
* 要求输入单声道16k音频
* @param in_buf
* @param in_len 长度小于等于gs_src_samplerate,最佳是gs_src_samplerate
* @param out_buf
* @param out_len 小于等于gs_dst_samplerate,最佳是gs_dst_samplerate[和输入有关,如果是32k,则恰好是输入的两倍]
* @return 0 表示正常
*/
int process_block(float *in_buf, int in_len, float *out_buf, int out_len);
/**
* 清空存储
* @return
*/
void reset();
/**
* 获取延迟时间
* @return
*/
int get_latency_ms();
private:
// 是否进行过init
bool m_init;
+ bool m_switch_model;
std::shared_ptr<Hubert> m_hubert_inst;
std::shared_ptr<CSynthesizer> m_synthesizer_inst;
std::shared_ptr<ESPYIN> m_es_pyin;
// 缓存使用的数据
// 要求输入的时间片长度,采样点数
int m_input_block_frame;
// 推理时额外需要的长度
int m_input_extra_frame;
// 推理时使用的buffer长度
int m_input_predict_buf_frame;
// 推理时使用的buffer
float *m_input_predict_buf;
std::vector<float> m_f0_data;
std::vector<float> m_f0_coarse_data;
// 输出的情况
int m_crossfade_frame;
int m_output_block_frame;
int m_output_cache_buf_frame;
float *m_crossfade_buf;
float *m_output_cache_buf;
+ bool m_fade_in;
// 各个实例的返回结果
std::vector<std::vector<std::vector<float>>> m_hubert_ret;
std::vector<std::vector<std::vector<float>>> m_synth_input;
std::vector<std::vector<std::vector<float>>> m_synth_out;
};
/**
 * Sample-rate / channel converter used by the real-time wrapper.
 * Holds an optional CFfmpegResampler instance together with the input and
 * output channel counts passed to init().
 */
class CResample {
public:
CResample();
~CResample();
public:
/**
 * Configure the converter.
 * @param in_samplerate  sample rate of the input
 * @param out_samplerate sample rate of the output
 * @param in_channel     number of input channels (defaults to 1)
 * @param out_channel    number of output channels (defaults to 1)
 * @return an int status code
 */
int init(int in_samplerate, int out_samplerate, int in_channel=1, int out_channel=1);
// Returns the number of sample points for a single channel.
int get_out_samples(int num);
int get_latency();
void reset();
// Does not take internally buffered data into account: it processes whatever is supplied.
// in_num and out_num are both per-channel sample counts.
int resample(float * in_buf, int in_num, float * out_buf, int & out_num);
private:
std::shared_ptr<CFfmpegResampler> m_resample_inst;
int m_in_channel;
int m_out_channel;
};
/**
* 实时处理的类
* 入一帧出一帧,允许非常短的帧做输入,延迟较高,在2s左右
* 思路:
* 1. 构造函数设置变量
* 2. init初始化环境,开启处理线程
* 3. process,每次送一帧,触发一次判断逻辑
* 4. flush函数将输入的未处理的数据全部处理一次,联合之前没有被取出的数据一起刷出来
* 5. 析构时关闭处理线程,并释放所有空间
*/
class CRvcLiteOnlineRealTime {
public:
CRvcLiteOnlineRealTime();
~CRvcLiteOnlineRealTime();
private:
void init_variable();
void rvc_process();
void rvc_process_step();
void uninit();
void stop();
public:
/**
* 初始化函数
* @param hubert_model_path
- * @param synth_model_path
* @param sample_rate
* @param channel
* @return
*/
- int init(const char *hubert_model_path, const char *synth_model_path, int sample_rate, int channel);
+ int init(const char *hubert_model_path, int sample_rate, int channel);
/**
* 切换音色
* @param synth_model_path
* @return
*/
int switch_synth(const char *synth_model_path);
/**
* 清空缓存
*/
void reset();
/**
* 入一帧,出一帧,要求长度一致
* 两者可以是同一块buffer
* @param in_buf
* @param in_len
* @param out_buf
* @param out_len
* @return
*/
int process(float *in_buf, int in_len, float *out_buf, int out_len);
/**
* 将所有处理好的结果获取出来
* 因为不确定还有多少,所以由内部来开辟空间,外部进行释放
* @return
*/
void flush(float *&out_buf, int &len);
/**
* 获取延迟时间
*/
int get_latency_ms();
private:
int m_sample_rate;
int m_channel;
std::shared_ptr<CRvcCircleBuffer> m_resample_queue;
std::shared_ptr<CRvcCircleBuffer> m_input_queue;
std::shared_ptr<CRvcCircleBuffer> m_out_queue;
int m_input_tmp_buf_len;
int m_output_tmp_buf_len;
std::shared_ptr<float> m_input_tmp_buf;
std::shared_ptr<float> m_output_tmp_buf;
std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
std::shared_ptr<CThreadPool> m_thread_pool;
// 逻辑变量
bool m_init;
// 处理线程相关
bool m_rvc_stop;
std::mutex m_rvc_mutex;
std::condition_variable m_rvc_cond;
// 重采样相关
std::shared_ptr<CResample> m_resample16;
std::shared_ptr<CResample> m_resample2src;
int m_resample_buf_max_len;
std::shared_ptr<float> m_resample_in_buf;
std::shared_ptr<float> m_resample_out_buf;
// 切换音色
std::string m_synth_path;
std::string m_new_synth_path;
+
+ // 合成的状态
+ int m_syn_state;
+ // 延迟器
+ std::shared_ptr<CRvcCircleBuffer> m_latency_queue;
};
#endif //MNN_DEMO_CRVCLITEONLINE_H
diff --git a/mnn_demo/inc/CRvcLiteSynthesizer.h b/mnn_demo/inc/CRvcLiteSynthesizer.h
new file mode 100644
index 0000000..bac8c21
--- /dev/null
+++ b/mnn_demo/inc/CRvcLiteSynthesizer.h
@@ -0,0 +1,58 @@
+//
+// Created by Administrator on 2024/1/21.
+//
+
+#ifndef MNN_DEMO_CRVCLITESYNTHESIZER_H
+#define MNN_DEMO_CRVCLITESYNTHESIZER_H
+#include "CRvcLiteOnline.h"
+
+class CRvcLiteSynthesizer
+{
+public:
+ CRvcLiteSynthesizer();
+ ~CRvcLiteSynthesizer();
+
+public:
+ /**
+ * 初始化
+ * @param hubert_model 语义模型地址
+ * @param synth_model 音色模型地址
+ * @param sample_rate 采样率
+ * @param channel 通道数
+ * @return 0 表示正常
+ */
+ int init(const char* hubert_model, const char* synth_model, int sample_rate, int channel);
+
+ /**
+ * 处理逻辑
+ * @param in_buf 输入的buf
+ * @param in_len 输入的Buf长度,frame*channel,建议输入小于等于1s的音频长度,尽量的大就好
+ * @param out_buf 输出的buf
+ * @param out_len 输出的buf长度, frame*channel
+ * 注意: 此处有可能出现输出的长度不一定等于in_len,输出的值会小于等于out_len,但是是连续的,所以out_len可以适当比in_len大一些,从而保证都能搞出来
+ * @return
+ */
+ int process(float* in_buf, int in_len, float* out_buf, int &out_len);
+
+ // 获取实时率,处理1s数据的真实耗时/1s
+ float get_rtf();
+
+private:
+ std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
+ std::shared_ptr<CResample> m_resample2_16;
+ std::shared_ptr<CResample> m_resample2src;
+ int m_channel;
+ int m_sample_rate;
+ std::shared_ptr<float> m_buf_tmp_16k;
+ int m_buf_tmp_16k_len;
+ int m_buf_tmp_16k_cap;
+ std::shared_ptr<float> m_buf_tmp_32k;
+ int m_buf_tmp_32k_len;
+ int m_buf_tmp_32k_cap;
+ std::shared_ptr<float> m_buf_tmp_src;
+ int m_buf_tmp_src_len;
+ int m_buf_tmp_src_cap;
+};
+
+
+#endif //MNN_DEMO_CRVCLITESYNTHESIZER_H
diff --git a/mnn_demo/main.cpp b/mnn_demo/main.cpp
index 7b4f6dc..8aa637d 100644
--- a/mnn_demo/main.cpp
+++ b/mnn_demo/main.cpp
@@ -1,171 +1,221 @@
#include <sys/time.h>
#include <thread>
#include <chrono>
#include "src/Hubert.h"
#include "src/CSynthesizer.h"
-
+#include "CRvcLiteSynthesizer.h"
/**
 * Smoke test for the Hubert model: loads the model, feeds a constant-valued
 * input of 33280 samples and runs a single inference into a 1 x 205 x 256
 * zero-initialised result container.
 * @return always 0
 */
int test_hubert() {
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v1_fp16.mnn";

    Hubert hubert;
    int err_code = hubert.init(hubert_model_path);

    // Constant input signal.
    std::vector<float> samples(33280, 0.1);

    // Result container shaped 1 x 205 x 256, built in one expression.
    std::vector<std::vector<std::vector<float>>> features(
            1, std::vector<std::vector<float>>(205, std::vector<float>(256)));

    float time = hubert.process(samples.data(), features);
    return 0;
}
/**
 * Timing test for the synthesizer model.
 * Builds a constant 1 x 205 x 258 input (0.1 everywhere, except column 256 = 0.2
 * and column 257 = 1.0), runs the model several times and prints the mean of the
 * values returned by CSynthesizer::process().
 *
 * Fix: the mean is now divided by the real number of runs; the previous code
 * divided the sum of 10 runs by 100.
 * @return always 0
 */
int test_contentvec() {
    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
    CSynthesizer contentVec;
    int err_code = contentVec.init(contentvec_model_path);
    (void) err_code;

    // One input frame: 258 values, the last two carry distinct constants.
    std::vector<float> frame(258, 0.1);
    frame[256] = 0.2;
    frame[257] = 1.0;
    std::vector<std::vector<std::vector<float>>> input(
            1, std::vector<std::vector<float>>(205, frame));

    // Output container shaped 1 x 1 x 35840.
    std::vector<std::vector<std::vector<float>>> ret(
            1, std::vector<std::vector<float>>(1, std::vector<float>(35840)));

    const int run_count = 10;
    float tot = 0.f;
    for (int run = 0; run < run_count; run++) {
        tot += contentVec.process(input, ret);
    }
    // Average over the number of runs actually executed.
    printf("time: %f \n", tot / run_count);
    return 0;
}
#include "CRvcLiteOnline.h"
#include "av_waves/waves/inc/STWaveFile.h"
void test() {
const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_16.wav";
// const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v1.wav";
CRvcLiteOnline rvc_inst;
- rvc_inst.init(hubert_model_path, contentvec_model_path);
+ rvc_inst.init(hubert_model_path);
// 读取音频文件, 要求16k,单声道
STCWaveFile wav_inst(in_wav, false);
int sample_rate = wav_inst.GetSampleRate();
int channel = wav_inst.GetChannels();
int len = wav_inst.GetTotalFrames() * channel;
float *data = new float[len];
float *outdata = new float[len * 2];
wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
int step = sample_rate;
printf("start ..\n");
for (int i = 0; i < len; i += step) {
if (i + step > len) {
step = len - i;
}
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
rvc_inst.process_block(data + i, step, outdata + 2 * i, 2 * step);
gettimeofday(&end, NULL);
printf("sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
}
STCWaveFile wav_out_inst(out_wav, true);
wav_out_inst.SetSampleRate(32000);
wav_out_inst.SetChannels(1);
wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
wav_out_inst.SetupDone();
wav_out_inst.WriteFrame(outdata, len * 2);
printf("finish2 ....\n");
}
+void test_rvc_lite_synth()
+{
+ const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
+ const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
+ const char *out_wav = "/mnt/d/dataset/tmp/i_out3.wav";
+ const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
+
+ STCWaveFile wav_inst(in_wav, false);
+ int sample_rate = wav_inst.GetSampleRate();
+ int channel = wav_inst.GetChannels();
+ int len = wav_inst.GetTotalFrames() * channel;
+ float *data = new float[len];
+ float *outdata = new float[len];
+ wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
+ CRvcLiteSynthesizer m_rvc_inst;
+ int err = m_rvc_inst.init(hubert_model_path, syz_model, sample_rate, channel);
+ printf("init err=%d!\n", err);
+ printf("rtf=%f\n", m_rvc_inst.get_rtf());
+ int step = sample_rate * channel - 100 * channel;
+ int out_len = 0;
+ for(int i = 0; i < len; i+=step)
+ {
+ if (i + step > len) {
+ step = len - i;
+ }
+ int out_step = step;
+ err = m_rvc_inst.process(data+i, step, outdata+out_len, out_step);
+ if(err != ERR_RVC_LITE_SUCCESS)
+ {
+ printf("process err!\n");
+ return ;
+ }
+ out_len += out_step;
+ }
+ STCWaveFile wav_out_inst(out_wav, true);
+ wav_out_inst.SetSampleRate(sample_rate);
+ wav_out_inst.SetChannels(channel);
+ wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
+ wav_out_inst.SetupDone();
+ wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
+ delete[] data;
+ delete[] outdata;
+}
+
+
void test_rvc_lite_online() {
// const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
// const char *hubert_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/layers6_checkpoint_14_1660000_1_hubert.mnn";
const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
// const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
// const char *syz_model = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xusong_v1_6hubert_hifix_syz_base_vctk_kd_32k_hubert6_jianli_e225_s62775_205.mnn";
- const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_xusong.mnn";
+ const char *xs_model = "/mnt/d/dataset/svc/models/layers_3/layer3_xusong.mnn";
+ const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
// const char *contentvec_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xiafan_fp16.mnn";
- const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01.wav";
+// const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01.wav";
+ const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
// const char* in_wav = "/mnt/d/dataset/svc/dataset/短数据样本/男声/qiankun.wav";
// const char* in_wav = "/mnt/d/dataset/tmp/i.wav";
// const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
// const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v4.wav";
// const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/qiankun_412_v4.wav";
const char *out_wav = "/mnt/d/dataset/tmp/i_out2.wav";
// 读取音频文件, 要求16k,单声道
STCWaveFile wav_inst(in_wav, false);
int sample_rate = wav_inst.GetSampleRate();
int channel = wav_inst.GetChannels();
int len = wav_inst.GetTotalFrames() * channel;
float *data = new float[len];
float *outdata = new float[len];
CRvcLiteOnlineRealTime rvc_inst;
- rvc_inst.init(hubert_model_path, syz_model, sample_rate, channel);
+ rvc_inst.init(hubert_model_path, sample_rate, channel);
wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
int step = 1024;
printf("start ..\n");
bool flag = true;
+ rvc_inst.switch_synth(syz_model);
for (int i = 0; i < len; i += step) {
if (i + step > len) {
step = len - i;
}
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
int ret = rvc_inst.process(data + i, step, outdata+i, step);
std::this_thread::sleep_for(std::chrono::milliseconds (15));
gettimeofday(&end, NULL);
printf("ret = %d, sp = %f ms step=%d\n", ret,
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0, step);
if (flag && i >= len / 3) {
flag = false;
- rvc_inst.switch_synth(syz_model);
+ rvc_inst.reset();
+// rvc_inst.switch_synth(xs_model);
}
}
STCWaveFile wav_out_inst(out_wav, true);
wav_out_inst.SetSampleRate(sample_rate);
wav_out_inst.SetChannels(channel);
wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
wav_out_inst.SetupDone();
wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
float* flush_data;
int flush_len;
rvc_inst.flush(flush_data, flush_len);
wav_out_inst.WriteFrame(flush_data, flush_len/channel);
printf("finish2 ....\n");
}
int main() {
// int ret_hubert = test_hubert();
// int ret_contentvec = test_contentvec();
// test();
// test();
- test_rvc_lite_online();
+// test_rvc_lite_online();
+ test_rvc_lite_synth();
return 0;
}
diff --git a/mnn_demo/src/CRvcLiteOnline.cpp b/mnn_demo/src/CRvcLiteOnline.cpp
index 3a08e27..4f9c833 100644
--- a/mnn_demo/src/CRvcLiteOnline.cpp
+++ b/mnn_demo/src/CRvcLiteOnline.cpp
@@ -1,633 +1,811 @@
//
// Created by Administrator on 2023/11/29.
//
#include <cmath>
#include <cstring>
#include <sys/time.h>
#include "CRvcLiteOnline.h"
#include "Hubert.h"
#include "CSynthesizer.h"
#include "espyin-v1.0/ESPYIN.h"
#include "ThreadPool.h"
#include "CRvcCircleBuffer.h"
#include "FfmpegResampler.h"
+#include <unistd.h>
+
+inline bool file_exists (const std::string& name) {
+ return ( access( name.c_str(), F_OK ) != -1 );
+}
/**
 * Down-mix interleaved stereo to mono by averaging each left/right pair.
 * @param input  interleaved stereo samples
 * @param size   length of the input buffer in floats (not frames); a trailing
 *               unpaired sample is ignored
 * @param output receives size / 2 mono samples
 */
void stereo2mono(float *input, int size, float *output) {
    const int frames = size > 0 ? size / 2 : 0;
    for (int f = 0; f < frames; ++f) {
        const float left = input[2 * f];
        const float right = input[2 * f + 1];
        output[f] = (left + right) / 2;
    }
}
/**
 * Up-mix mono to interleaved stereo by duplicating every sample into both channels.
 * @param input  mono samples
 * @param size   number of mono samples
 * @param output receives 2 * size interleaved samples
 */
void mono2stereo(float *input, int size, float *output) {
    for (int frame = 0; frame < size; ++frame) {
        const float sample = input[frame];
        float *pair = output + 2 * frame;
        pair[0] = sample;
        pair[1] = sample;
    }
}
CRvcLiteOnline::CRvcLiteOnline() {
init_variable();
m_init = false;
+ m_switch_model = false;
// 输入部分需要的变量
// 要求输入的时间片长度,采样点数
m_input_block_frame = int(gs_block_time * gs_src_samplerate);
// 推理时额外需要的长度
m_input_extra_frame = int(gs_extra_time * gs_src_samplerate);
int zc = gs_src_samplerate / 100; // 10ms的点数
int input_corssfade_frame = int(gs_crossfade_time * gs_src_samplerate);
// 推理时使用的buffer长度
m_input_predict_buf_frame = int(ceil((m_input_extra_frame + input_corssfade_frame + m_input_block_frame)
* 1.0 / zc) * zc);
// 推理时使用的buffer
m_input_predict_buf = new float[m_input_predict_buf_frame];
memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);
// 输出部分需要的变量
m_crossfade_frame = int(gs_crossfade_time * gs_dst_samplerate);
m_output_block_frame = int(gs_block_time * gs_dst_samplerate);
int output_extra_frame = int(gs_extra_time * gs_dst_samplerate);
zc = gs_dst_samplerate / 100;
m_output_cache_buf_frame = int(ceil((m_output_block_frame + m_crossfade_frame + output_extra_frame)
* 1.0 / zc) * zc);
m_output_cache_buf = new float[m_output_cache_buf_frame];
memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
m_crossfade_buf = new float[m_crossfade_frame];
memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);
// 对于模型的输入和输出进行缓存
// 此处是写死的和模型有关
m_hubert_ret.resize(1);
m_hubert_ret[0].resize(gs_hubert_frame);
for (int i = 0; i < gs_hubert_frame; i++) {
m_hubert_ret[0][i].resize(gs_hubert_dim);
}
// synth模型的输入
m_synth_input.resize(1);
m_synth_input[0].resize(gs_synth_input_frame);
for (int i = 0; i < gs_synth_input_frame; i++) {
m_synth_input[0][i].resize(gs_synth_input_dim);
}
m_synth_out.resize(1);
m_synth_out[0].resize(1);
m_synth_out[0][0].resize(gs_synth_output_frame);
}
// Destructor: delegates to uninit(), which releases the raw buffers
// (m_input_predict_buf, m_output_cache_buf, m_crossfade_buf) and resets the members.
CRvcLiteOnline::~CRvcLiteOnline() {
uninit();
}
/**********************************对内函数*********************************************/
/**
 * Release the three raw buffers owned by this object and put every member
 * back into its default state.
 */
void CRvcLiteOnline::uninit() {
    // delete[] on a null pointer is a no-op, so no explicit null checks are required.
    delete[] m_input_predict_buf;
    delete[] m_output_cache_buf;
    delete[] m_crossfade_buf;

    // init_variable() nulls the buffer pointers and clears the remaining state.
    init_variable();
}
/**
 * Run the pYIN extractor over the whole prediction buffer and store the
 * resulting f0 track in m_f0_data (negative values are replaced by 0), then
 * reset the extractor and derive the coarse f0 via get_f0_post().
 */
void CRvcLiteOnline::get_pyin_f0() {
    // Feed the buffer to the extractor in hops of 160 samples.
    int offset = 0;
    while (offset < m_input_predict_buf_frame) {
        m_es_pyin->process(m_input_predict_buf + offset);
        offset += 160;
    }

    m_f0_data.clear();
    ESFeatureSet feats = m_es_pyin->getRemainingFeatures();
    if (!feats.empty()) {
        // Feature slot 4 holds the values used as f0 here.
        auto &&pitch_track = feats[4];
        const size_t count = pitch_track.size();
        m_f0_data.resize(count);
        for (size_t k = 0; k < count; ++k) {
            // JL_DEBUG
            const float value = pitch_track[k].values[0];
            m_f0_data[k] = (value < 0) ? 0.0f : value;
        }
    }

    m_es_pyin->reset();
    get_f0_post();
}
/**
 * Quantise m_f0_data into m_f0_coarse_data.
 * Each f0 value is mapped onto a log-frequency scale, linearly rescaled so that
 * the [50, 1100] range maps to [1, 255], clamped to that interval and rounded
 * to the nearest integer (stored as float). Values whose log-scale result is
 * not positive end up as 1.
 */
void CRvcLiteOnline::get_f0_post() {
    const int f0_min = 50;
    const int f0_max = 1100;
    const float mel_low = 1127 * log2(1 + f0_min * 1.0 / 700);
    const float mel_high = 1127 * log2(1 + f0_max * 1.0 / 700);

    const size_t frame_count = m_f0_data.size();
    m_f0_coarse_data.assign(frame_count, 0.0f);

    for (size_t idx = 0; idx < frame_count; ++idx) {
        float mel = 1127 * log2(1 + m_f0_data[idx] / 700);
        if (mel > 0) {
            mel = (mel - mel_low) * 254.f / (mel_high - mel_low) + 1;
        }
        // Clamp into [1, 255].
        if (mel > 255) {
            mel = 255;
        } else if (mel <= 1) {
            mel = 1;
        }
        // Round to nearest by adding one half and truncating.
        m_f0_coarse_data[idx] = float(int(mel + 0.5));
    }
}
void CRvcLiteOnline::init_variable() {
m_init = false;
+ m_switch_model = false;
// 缓存使用的数据
// 要求输入的时间片长度,采样点数
m_input_block_frame = 0;
m_input_extra_frame = 0;
m_input_predict_buf_frame = 0;
m_input_predict_buf = nullptr;
m_f0_data.clear();
m_f0_coarse_data.clear();
m_crossfade_frame = 0;
m_output_block_frame = 0;
m_output_cache_buf_frame = 0;
m_crossfade_buf = nullptr;
m_output_cache_buf = nullptr;
// 各个实例的返回结果
m_hubert_ret.clear();
m_synth_input.clear();
m_synth_out.clear();
+
+ m_fade_in = true;
}
/**********************************对外函数*********************************************/
-int CRvcLiteOnline::init(const char *hubert_model_path, const char *synth_model_path) {
+int CRvcLiteOnline::init(const char *hubert_model_path) {
if (m_init) {
return ERR_RVC_LITE_REINIT;
}
m_hubert_inst = std::make_shared<Hubert>();
m_synthesizer_inst = std::make_shared<CSynthesizer>();
m_hubert_inst->init(hubert_model_path);
- m_synthesizer_inst->init(synth_model_path);
+// m_synthesizer_inst->init(synth_model_path);
// 要求stepSize必须是2^n
m_es_pyin = std::make_shared<ESPYIN>(16000, 160, 1024, 50, 1100);
m_init = true;
+ m_switch_model = false;
+ m_fade_in = true;
return ERR_RVC_LITE_SUCCESS;
}
int CRvcLiteOnline::switch_synth_model(const char *synth_model_path) {
if (!m_init) {
return ERR_RVC_LITE_NOT_INIT;
}
- m_synthesizer_inst = std::make_shared<CSynthesizer>();
- m_synthesizer_inst->init(synth_model_path);
- return ERR_RVC_LITE_SUCCESS;
+ if (file_exists(synth_model_path))
+ {
+ m_synthesizer_inst = std::make_shared<CSynthesizer>();
+ m_synthesizer_inst->init(synth_model_path);
+ m_switch_model = true;
+ return ERR_RVC_LITE_SUCCESS;
+ }
+ return ERR_RVC_LITE_MODEL_NOT_EXISTS;
}
void CRvcLiteOnline::reset() {
memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);
memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);
memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
+ m_fade_in = true;
}
int CRvcLiteOnline::process_block(float *in_buf, int in_len, float *out_buf, int out_len) {
if (!m_init) {
return ERR_RVC_LITE_NOT_INIT;
}
+ if (!m_switch_model)
+ {
+ return ERR_RVC_LITE_NOT_SWITCH_MODEL;
+ }
+
+ // 外部数据产生不连贯,比如做了reset的时候,需要做fade_in
+ if (m_fade_in)
+ {
+ for(int i = 0; i < in_len; i++)
+ {
+ float rate = i * 1.0 / in_len;
+ in_buf[i] = in_buf[i] * rate;
+ }
+ m_fade_in = false;
+ }
+
// 剔除尾部的block的数据
memcpy(m_input_predict_buf, m_input_predict_buf + in_len,
sizeof(float) * (m_input_predict_buf_frame - in_len));
// 向尾部填充in_buf的数据
memcpy(m_input_predict_buf + (m_input_predict_buf_frame - in_len), in_buf,
sizeof(float) * in_len);
// 提取f0特征序列
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
get_pyin_f0();
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "get pyin sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 推理hubert
gettimeofday(&start, NULL);
m_hubert_inst->process(m_input_predict_buf, m_hubert_ret);
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "m_hubert_inst sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 合成语音
for (int i = 0; i < gs_synth_input_frame; i++) {
// 拷贝数据 1,gs_hubert_frame,258
for (int j = 0; j < gs_hubert_dim; j++) {
m_synth_input[0][i][j] = m_hubert_ret[0][i][j];
}
m_synth_input[0][i][256] = m_f0_coarse_data[i];
m_synth_input[0][i][257] = m_f0_data[i];
}
gettimeofday(&start, NULL);
m_synthesizer_inst->process(m_synth_input, m_synth_out);
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "m_synthesizer_inst sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 将结果全部放到缓存中
memcpy(m_output_cache_buf, m_output_cache_buf + gs_synth_output_frame,
sizeof(float) * (m_output_cache_buf_frame - gs_synth_output_frame));
memcpy(m_output_cache_buf + (m_output_cache_buf_frame - gs_synth_output_frame),
m_synth_out[0][0].data(), sizeof(float) * gs_synth_output_frame);
int start_pos = m_output_cache_buf_frame - m_crossfade_frame - out_len;
memcpy(out_buf, m_output_cache_buf + start_pos, sizeof(float) * out_len);
// 对头部数据做fade_in以及fadeout
for (int i = 0; i < m_crossfade_frame; i++) {
float rate = float(i * 1.f / m_crossfade_frame);
out_buf[i] = rate * out_buf[i] + m_crossfade_buf[i] * (1 - rate);
}
memcpy(m_crossfade_buf, m_output_cache_buf + (m_output_cache_buf_frame - m_crossfade_frame),
sizeof(float) * m_crossfade_frame);
+
return 0;
}
/**
 * @return the crossfade duration (gs_crossfade_time, in seconds) converted to
 *         milliseconds and truncated to int
 */
int CRvcLiteOnline::get_latency_ms() {
    const double latency_ms = gs_crossfade_time * 1000;
    return static_cast<int>(latency_ms);
}
/*******************************对内的类**************************************/
/**
 * Construct an unconfigured converter.
 * The channel counts are initialised to 1 (the defaults declared for init())
 * so that resample() never reads indeterminate members when it is called
 * before init().
 */
CResample::CResample()
        : m_resample_inst(nullptr), m_in_channel(1), m_out_channel(1) {
}
// Nothing to release explicitly: m_resample_inst is a std::shared_ptr.
CResample::~CResample()
{
}
/**
 * Configure the converter.
 * When only the channel count differs (same sample rate), no CFfmpegResampler
 * is created and resample() performs the channel conversion itself; in every
 * other case the work is delegated to a freshly created CFfmpegResampler.
 * @return ERR_RVC_LITE_SUCCESS for the channel-only case, otherwise the result
 *         of CFfmpegResampler::init()
 */
int CResample::init(int in_samplerate, int out_samplerate, int in_channel, int out_channel)
{
    m_in_channel = in_channel;
    m_out_channel = out_channel;

    const bool channel_only = (in_samplerate == out_samplerate) && (in_channel != out_channel);
    if (channel_only) {
        // Handled by the built-in stereo/mono helpers.
        m_resample_inst = nullptr;
        return ERR_RVC_LITE_SUCCESS;
    }

    m_resample_inst = std::make_shared<CFfmpegResampler>();
    return m_resample_inst->init(in_samplerate, out_samplerate, in_channel, out_channel);
}
/**
 * @param num input sample count
 * @return the value reported by the underlying resampler, or num unchanged
 *         when no resampler instance is in use
 */
int CResample::get_out_samples(int num)
{
    if (!m_resample_inst) {
        return num;
    }
    return m_resample_inst->get_out_samples(num);
}
/**
 * Reset the underlying resampler, if one exists; otherwise do nothing.
 */
void CResample::reset()
{
    if (!m_resample_inst) {
        return;
    }
    m_resample_inst->reset();
}
/**
 * @return the latency reported by the underlying resampler, or 0 when no
 *         resampler instance is in use
 */
int CResample::get_latency()
{
    if (!m_resample_inst) {
        return 0;
    }
    return m_resample_inst->get_latency();
}
/**
 * Convert one chunk of audio.
 * in_num and out_num are per-channel sample (frame) counts, as declared in the
 * class header. With a resampler instance the call is forwarded unchanged.
 * Without one, only the channel layout is converted (stereo -> mono or
 * mono -> stereo), so the number of output frames equals in_num.
 *
 * Fixes relative to the previous version:
 *  - stereo -> mono: stereo2mono() takes the buffer length in floats, so it must
 *    be given in_num * 2; passing in_num converted only half of the frames.
 *  - out_num is now set to the number of frames written on the channel-only
 *    paths instead of being left at the caller's capacity value.
 *
 * @param in_buf  input samples (interleaved when stereo)
 * @param in_num  number of input frames per channel
 * @param out_buf output buffer
 * @param out_num in: capacity of out_buf in frames per channel;
 *                out (channel-only paths): frames written
 * @return ERR_RVC_LITE_SUCCESS, ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT when
 *         out_num < in_num on a channel-only path, or the resampler's result
 */
int CResample::resample(float *in_buf, int in_num, float *out_buf, int &out_num) {
    if (m_resample_inst) {
        return m_resample_inst->resample(in_buf, in_num, out_buf, out_num);
    }

    const bool down_mix = (m_in_channel == 2 && m_out_channel == 1);
    const bool up_mix = (m_in_channel == 1 && m_out_channel == 2);
    if (!down_mix && !up_mix) {
        // No conversion configured: nothing is written, as before.
        return ERR_RVC_LITE_SUCCESS;
    }

    if (out_num < in_num) {
        return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
    }

    if (down_mix) {
        // in_num frames of stereo occupy in_num * 2 floats.
        stereo2mono(in_buf, in_num * 2, out_buf);
    } else {
        mono2stereo(in_buf, in_num, out_buf);
    }
    out_num = in_num;
    return ERR_RVC_LITE_SUCCESS;
}
/*******************************对外的类***************************************/
/*******************************对内函数***************************************/
void CRvcLiteOnlineRealTime::init_variable() {
m_init = false;
m_rvc_stop = true;
m_sample_rate = 44100;
m_channel = 1;
m_synth_path = "";
m_new_synth_path = "";
+ m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
}
/*******************************对外函数***************************************/
// Constructor: only puts the scalar members into their default state via
// init_variable(); all buffers, queues and the worker thread are created in init().
CRvcLiteOnlineRealTime::CRvcLiteOnlineRealTime() {
init_variable();
}
// Destructor: delegates all teardown to uninit().
CRvcLiteOnlineRealTime::~CRvcLiteOnlineRealTime() {
uninit();
}
-int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, const char *synth_model_path, int sample_rate,
- int channel) {
+int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, int sample_rate, int channel) {
if (m_init) {
return ERR_RVC_LITE_RT_REINIT;
}
if (sample_rate < 16000) {
return ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR;
}
init_variable();
m_sample_rate = sample_rate;
m_channel = channel;
- m_synth_path = synth_model_path;
- m_new_synth_path = synth_model_path;
+ m_synth_path = "";
+ m_new_synth_path = "";
+ m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
int output_one_sec_number = m_sample_rate * m_channel; // 临时使用的数据
+ int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
CThreadPool::Task task = std::bind(&CRvcLiteOnlineRealTime::rvc_process, this);
m_rvc_inst = std::make_shared<CRvcLiteOnline>();
- int err = m_rvc_inst->init(hubert_model_path, synth_model_path);
+ int err = m_rvc_inst->init(hubert_model_path);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
-
// 重采样部分
m_resample_queue = std::make_shared<CRvcCircleBuffer>(sample_rate * 3 * m_channel);
m_resample16 = std::make_shared<CResample>();
err = m_resample16->init(m_sample_rate, gs_src_samplerate, m_channel, 1);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
m_resample2src = std::make_shared<CResample>();
err = m_resample2src->init(gs_dst_samplerate, m_sample_rate, 1, m_channel);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
m_resample_buf_max_len = 2048; // 此时空间最大是2048,保证不超即可
m_resample_in_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
m_resample_out_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
// 核心处理部分
m_input_tmp_buf_len = gs_src_samplerate;
m_output_tmp_buf_len = gs_dst_samplerate;
m_input_tmp_buf = std::shared_ptr<float>(new float[m_input_tmp_buf_len], std::default_delete<float[]>());
m_output_tmp_buf = std::shared_ptr<float>(new float[m_output_tmp_buf_len], std::default_delete<float[]>());
memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
// 循环buffer
m_input_queue = std::make_shared<CRvcCircleBuffer>(m_input_tmp_buf_len * 3);
// 对外的是目标的采样率和通道数的数据
m_out_queue = std::make_shared<CRvcCircleBuffer>(output_one_sec_number * 3);
-
+ m_latency_queue = std::make_shared<CRvcCircleBuffer>(latency_len);
// 提前塞入两组,保证延迟稳定在2s
for (int i = 0; i < 2; i++) {
// 塞入1s数据
for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
}
+ // 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟
+ for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
+ m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
+ }
+ m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
// 开始处理线程
m_thread_pool = std::make_shared<CThreadPool>();
m_thread_pool->start(1);
m_rvc_stop = false;
m_thread_pool->run(task);
m_init = true;
exit:
if (ERR_RVC_LITE_SUCCESS != err) {
m_init = true;
uninit();
}
return err;
}
/**
 * Requests a switch to another synthesizer model.
 *
 * Only the requested path is recorded here (under m_rvc_mutex); the worker loop
 * compares it with the active path and performs the actual switch.
 *
 * @param synth_model_path path of the new model; an empty string selects the
 *                         "no effect" state. A null pointer is treated as an empty string.
 * @return ERR_RVC_LITE_RT_NOT_INIT when the instance is not initialised,
 *         otherwise ERR_RVC_LITE_SUCCESS
 */
int CRvcLiteOnlineRealTime::switch_synth(const char *synth_model_path) {
    if (!m_init) {
        return ERR_RVC_LITE_RT_NOT_INIT;
    }
    // Assigning a null const char* to std::string is undefined behaviour,
    // so map it to the empty path instead.
    const char *requested_path = (synth_model_path != nullptr) ? synth_model_path : "";
    {
        std::unique_lock<std::mutex> lock(m_rvc_mutex);
        m_new_synth_path = requested_path;
    }
    return ERR_RVC_LITE_SUCCESS;
}
/**
 * Hands one chunk of input to the worker and fetches already-processed output.
 *
 * @param in_buf  input samples, pushed into m_resample_queue
 * @param in_len  number of floats in in_buf
 * @param out_buf receives samples popped from m_out_queue
 * @param out_len number of floats requested in out_buf
 * @return ERR_RVC_LITE_RT_NOT_INIT when not initialised,
 *         ERR_RVC_LITE_RT_NOT_ENOUGH_DATA when the pop reported a different length
 *         than requested, otherwise ERR_RVC_LITE_SUCCESS
 */
int CRvcLiteOnlineRealTime::process(float *in_buf, int in_len, float *out_buf, int out_len) {
    if (!m_init) {
        return ERR_RVC_LITE_RT_NOT_INIT;
    }

    // Queue the new input and wake the worker while still holding the mutex.
    {
        std::unique_lock<std::mutex> guard(m_rvc_mutex);
        m_resample_queue->push(in_buf, in_len);
        m_rvc_cond.notify_all();
    }

    // Zero the output first; whatever the pop below does not overwrite stays silent.
    memset(out_buf, 0, sizeof(float) * out_len);

    // Fetch the processed samples.
    int popped_len = out_len;
    {
        std::unique_lock<std::mutex> guard(m_rvc_mutex);
        m_out_queue->pop(out_buf, popped_len);
    }

    return (popped_len == out_len) ? ERR_RVC_LITE_SUCCESS : ERR_RVC_LITE_RT_NOT_ENOUGH_DATA;
}
/**
 * Clears all queues, both resamplers and the rvc instance under m_rvc_mutex, then
 * re-fills m_out_queue and m_latency_queue with zeros. Does nothing when the
 * instance is not initialised.
 * NOTE(review): m_syn_state is not modified here — confirm that is intended.
 */
void CRvcLiteOnlineRealTime::reset() {
if (!m_init) {
return;
}
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_resample_queue->reset();
m_resample16->reset();
m_resample2src->reset();
m_input_queue->reset();
m_out_queue->reset();
m_rvc_inst->reset();
+ m_latency_queue->reset();
+ // Pre-fill two one-second groups so the latency stays fixed at 2 s
+ int output_one_sec_number = m_sample_rate * m_channel; // temporary value used only here
+ memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
+ m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
+ }
+ m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
+ }
+ // The algorithm itself has latency, so to keep the latency consistent the same delay must be added when no effect is applied
+ int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
+ for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
+ m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
+ }
+ m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
}
}
/**
 * Stops the worker, pushes everything still buffered through the currently active
 * path (pass-through or effect) and returns all queued output to the caller.
 *
 * @param out_buf set to a buffer allocated here with new float[len]; nothing in this
 *                function frees it (presumably the caller must delete[] it — TODO confirm)
 * @param len     set from m_out_queue->size() before the final pop
 */
void CRvcLiteOnlineRealTime::flush(float *&out_buf, int &len) {
// Flush out all the data held internally
/**
* Stop the worker first
*/
stop();
- // First finish the resampling part
+ // Case: no timbre conversion
int resample_in_len = 0;
int resample_out_len = 0;
-// m_resample_queue->push(m_resample_in_buf.get(), m_resample_buf_max_len);
+ if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
+ {
+ while (m_resample_queue->size() > 0) {
+ resample_in_len = m_resample_buf_max_len;
+ m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
+ m_latency_queue->push(m_resample_in_buf.get(), resample_in_len); // route the chunk through the latency queue
+ m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
+ m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
+ }
+
+ while(m_latency_queue->size() > 0) // drain what is still held in the latency queue
+ {
+ resample_in_len = m_resample_buf_max_len;
+ m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
+ m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
+ }
+
+ len = m_out_queue->size();
+ out_buf = new float[len];
+ m_out_queue->pop(out_buf, len);
+ return;
+ }
+
+ // Case: timbre conversion active
while (m_resample_queue->size() > 0) {
resample_in_len = m_resample_buf_max_len;
m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
// The input data has to take the channel count into account
resample_out_len = m_resample16->get_out_samples(resample_in_len / m_channel);
m_resample16->resample(m_resample_in_buf.get(), resample_in_len / m_channel, m_resample_out_buf.get(),
resample_out_len);
// The output is 16k mono, so no channel handling is needed
m_input_queue->push(m_resample_out_buf.get(), resample_out_len);
}
memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
int add_size = m_input_tmp_buf_len - m_input_queue->size() % m_input_tmp_buf_len; // zeros needed to reach a whole block
if (add_size != 0 && add_size < m_input_tmp_buf_len) {
m_input_queue->push(m_input_tmp_buf.get(), add_size);
}
int num = m_input_queue->size() / m_input_tmp_buf_len;
for (int i = 0; i < num; i++) {
rvc_process_step();
}
// Copy out all the data
len = m_out_queue->size();
out_buf = new float[len];
m_out_queue->pop(out_buf, len);
}
/**
 * Reports the overall latency in milliseconds.
 *
 * @return the latency reported by the rvc instance plus a fixed 2000 ms
 */
int CRvcLiteOnlineRealTime::get_latency_ms() {
    const int fixed_extra_ms = 2000;
    const auto engine_latency_ms = m_rvc_inst->get_latency_ms();
    return engine_latency_ms + fixed_extra_ms;
}
/*******************************对内函数***************************************/
/**
 * Stops the worker and marks the instance as no longer initialised.
 *
 * Clearing m_init matters for init()'s failure path, which sets m_init to true only
 * so that this function runs; without the reset, a failed init would leave the
 * object reporting itself as initialised and the m_init guards in
 * process()/reset()/switch_synth() would let calls through.
 */
void CRvcLiteOnlineRealTime::uninit() {
    if (!m_init) {
        return;
    }
    stop();
    m_init = false;
}
void CRvcLiteOnlineRealTime::stop() {
// 释放thread_pool的数据,先通知一下rvc_process,防止是在等待中
m_rvc_stop = true;
if (m_thread_pool) {
m_rvc_cond.notify_all();
m_thread_pool->stop();
}
}
/**
 * Processes one block: pops m_input_tmp_buf_len samples from m_input_queue, runs
 * m_rvc_inst->process_block on them, resamples the result chunk by chunk with
 * m_resample2src and pushes each chunk to m_out_queue under m_rvc_mutex.
 * Returns immediately when fewer than m_input_tmp_buf_len samples are queued.
 * Both stages are timed and logged through LOGD.
 */
void CRvcLiteOnlineRealTime::rvc_process_step() {
struct timeval start;
struct timeval end;
int sample_out_len = 0;
// Start processing
if (m_input_queue->size() < m_input_tmp_buf_len) {
return;
}
gettimeofday(&start, NULL);
m_input_queue->pop(m_input_tmp_buf.get(), m_input_tmp_buf_len);
m_rvc_inst->process_block(m_input_tmp_buf.get(), m_input_tmp_buf_len,
m_output_tmp_buf.get(), m_output_tmp_buf_len);
gettimeofday(&end, NULL);
LOGD("RvcLite", "rvc_process process sp %f ms",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// Resample
// The sample rate goes up here, though by less than 2x, while the channel count may double, so the step is set to 1/4 of the buffer
gettimeofday(&start, NULL);
+ bool last = false;
int step = m_resample_buf_max_len / 4;
for (int i = 0; i < m_output_tmp_buf_len; i += step) {
- if (i + step > m_output_tmp_buf_len) {
+ if (i + step >= m_output_tmp_buf_len) {
step = m_output_tmp_buf_len - i;
+ last = true;
}
// The input here is mono, so the sample count equals the total length
sample_out_len = m_resample2src->get_out_samples(step);
m_resample2src->resample(m_output_tmp_buf.get() + i, step, m_resample_out_buf.get(), sample_out_len);
+
+ // Effect -> no effect
+ if(last && m_syn_state == RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT)
+ {
+ // The no-effect path also needs latency alignment, so only a fade-out is required here
+ for(int ii =0; ii < sample_out_len * m_channel; ii+=m_channel)
+ {
+ float rate = ii * 1.0 / step; // NOTE(review): ii runs up to sample_out_len * m_channel but is divided by step (the mono input length) — confirm rate stays within [0, 1]
+ for(int jj = 0; jj < m_channel; jj++)
+ {
+ m_resample_out_buf.get()[ii+jj] = m_resample_out_buf.get()[ii+jj] * (1 - rate);
+ }
+ }
+ m_syn_state = RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT;
+ }
+
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_out_queue->push(m_resample_out_buf.get(), sample_out_len * m_channel);
}
}
gettimeofday(&end, NULL);
LOGD("RvcLite", "rvc_process re_resample sp %f ms",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
printf("finish ...\n"); // NOTE(review): unconditional stdout print on every block — looks like leftover debugging
}
/**
 * Worker loop. Until m_rvc_stop is set it waits for m_resample_buf_max_len samples
 * in m_resample_queue, applies any pending synthesizer switch, and then either
 * passes the chunk through m_latency_queue to m_out_queue (no-effect states) or
 * resamples it with m_resample16 and runs rvc_process_step() (effect state).
 */
void CRvcLiteOnlineRealTime::rvc_process() {
int sample_in_len;
int sample_out_len = 0;
while (!m_rvc_stop) {
{
// Fetch one chunk for resampling
std::unique_lock<std::mutex> lock(m_rvc_mutex);
if (m_resample_queue->size() < m_resample_buf_max_len) {
// Check the stop flag before going to sleep
if (m_rvc_stop) {
return;
}
m_rvc_cond.wait(lock);
continue;
}
sample_in_len = m_resample_buf_max_len;
m_resample_queue->pop(m_resample_in_buf.get(), sample_in_len);
}
- // Resample to 16k; the sample rate goes down here, so sample_out_len > sample_in_len cannot happen
- sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel);
- m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(),
- sample_out_len);
- m_input_queue->push(m_resample_out_buf.get(), sample_out_len);
-
- // Start the voice conversion
+ /**
+ * There are three cases here.
+ * Whichever transition happens, the latency means zeros have to be fed in, so fading out the current data is enough.
+ * 1. none -> effect: fade out the outgoing part, fade in the part fed to the effect on the next frame
+ * 2. effect -> none: fade out the outgoing part, fade in the part fed in on the next frame
+ * 3. effect -> effect [nothing to do here, it is handled internally]
+ */
if (m_synth_path != m_new_synth_path) { // NOTE(review): m_new_synth_path is read here without m_rvc_mutex although switch_synth() writes it under the mutex — confirm
+
+ // none -> effect: fade out the current frame, fade in the next input frame
+ if(m_synth_path.empty() && !m_new_synth_path.empty())
+ {
+ m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT;
+ }
+
+ // effect -> none
+ if (!m_synth_path.empty() && m_new_synth_path.empty())
+ {
+ m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT;
+ }
+
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_synth_path = m_new_synth_path;
}
m_rvc_inst->switch_synth_model(m_new_synth_path.c_str());
}
+
+ // First frame right after switching over
+ if(m_syn_state == RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT)
+ {
+ // Just went from effect to none: clear the data and fade in the input
+ m_latency_queue->reset();
+ // The algorithm itself has latency, so to keep the latency consistent the same delay must be added when no effect is applied
+ memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
+ int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
+ for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
+ m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
+ }
+ m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
+
+ // Fade in the input
+ for(int i = 0; i < sample_in_len; i+=m_channel)
+ {
+ float rate = i * 1.0 / sample_in_len;
+ for(int j = 0; j < m_channel; j++)
+ {
+ m_resample_in_buf.get()[i+j] *= rate;
+ }
+ }
+ m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
+ }
+
+ // No effect applied
+ if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
+ {
+ m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
+ m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
+ {
+ std::unique_lock<std::mutex> lock(m_rvc_mutex);
+ m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
+ }
+ continue;
+ }
+
+ // Transition from none to effect
+ if (m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT)
+ {
+ // Apply the fade-out
+ for(int i = 0; i < sample_in_len; i+=m_channel)
+ {
+ float rate = i * 1.0 / sample_in_len;
+ for(int j = 0; j < m_channel; j++)
+ {
+ m_resample_in_buf.get()[i+j] *= 1 - rate;
+ }
+ }
+ m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
+ m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
+ {
+ std::unique_lock<std::mutex> lock(m_rvc_mutex);
+ m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
+ }
+
+ // From rvc's point of view the input is no longer continuous, so clear its internal data and start over
+ m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT;
+ m_rvc_inst->reset();
+ continue;
+ }
+
+ // Resample to 16k; the sample rate goes down here, so sample_out_len > sample_in_len cannot happen
+ sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel);
+ m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(),
+ sample_out_len);
+ m_input_queue->push(m_resample_out_buf.get(), sample_out_len);
rvc_process_step();
}
}
\ No newline at end of file
diff --git a/mnn_demo/src/CRvcLiteSynthesizer.cpp b/mnn_demo/src/CRvcLiteSynthesizer.cpp
new file mode 100644
index 0000000..9bce8d7
--- /dev/null
+++ b/mnn_demo/src/CRvcLiteSynthesizer.cpp
@@ -0,0 +1,106 @@
+//
+// Created by Administrator on 2024/1/21.
+//
+
+#include "CRvcLiteSynthesizer.h"
+#include <cstring>
+#include <sys/time.h>
+
+// Constructor does no work of its own; members are set up in init().
+CRvcLiteSynthesizer::CRvcLiteSynthesizer() = default;
+
+// Destructor has no body of its own; members clean up through their own destructors.
+CRvcLiteSynthesizer::~CRvcLiteSynthesizer() = default;
+
+/**
+ * Creates the inference engine and the two resamplers and clears the scratch-buffer bookkeeping.
+ *
+ * @param hubert_model path handed to CRvcLiteOnline::init()
+ * @param synth_model  path handed to CRvcLiteOnline::switch_synth_model()
+ * @param sample_rate  sample rate of the audio passed to process()
+ * @param channel      channel count of the audio passed to process()
+ * @return ERR_RVC_LITE_SUCCESS, or the first error code reported by the engine
+ *         or by either resampler
+ */
+int CRvcLiteSynthesizer::init(const char *hubert_model, const char *synth_model, int sample_rate, int channel)
+{
+    m_rvc_inst = std::make_shared<CRvcLiteOnline>();
+    int err = m_rvc_inst->init(hubert_model);
+    if (err != ERR_RVC_LITE_SUCCESS)
+    {
+        return err;
+    }
+    err = m_rvc_inst->switch_synth_model(synth_model);
+    if (err != ERR_RVC_LITE_SUCCESS)
+    {
+        return err;
+    }
+
+    // Caller format -> gs_src_samplerate mono. The resampler's result is checked so a
+    // failed setup is reported here instead of surfacing later in process().
+    m_resample2_16 = std::make_shared<CResample>();
+    err = m_resample2_16->init(sample_rate, gs_src_samplerate, channel, 1);
+    if (err != ERR_RVC_LITE_SUCCESS)
+    {
+        return err;
+    }
+
+    // gs_dst_samplerate mono -> caller format.
+    m_resample2src = std::make_shared<CResample>();
+    err = m_resample2src->init(gs_dst_samplerate, sample_rate, 1, channel);
+    if (err != ERR_RVC_LITE_SUCCESS)
+    {
+        return err;
+    }
+
+    m_channel = channel;
+    m_sample_rate = sample_rate;
+
+    // Scratch buffers are allocated lazily in process(); start with empty bookkeeping.
+    m_buf_tmp_16k_len = 0;
+    m_buf_tmp_16k_cap = 0;
+    m_buf_tmp_32k_len = 0;
+    m_buf_tmp_32k_cap = 0;
+    m_buf_tmp_src_len = 0;
+    m_buf_tmp_src_cap = 0;
+    return ERR_RVC_LITE_SUCCESS;
+}
+
+/**
+ * Converts one block of audio: resample to the engine input rate, run inference,
+ * resample back to the caller's format.
+ *
+ * @param in_buf  input samples; in_len / m_channel frames are consumed
+ * @param in_len  number of floats in in_buf
+ * @param out_buf receives the converted samples
+ * @param out_len in: capacity of out_buf in floats; out: lowered to the number of
+ *                floats produced when fewer are available
+ * @return ERR_RVC_LITE_BLOCK_TOO_LONG when the resampled input would exceed
+ *         gs_src_samplerate samples, the error of a failing stage, or
+ *         ERR_RVC_LITE_SUCCESS
+ */
+int CRvcLiteSynthesizer::process(float *in_buf, int in_len, float *out_buf, int &out_len) {
+    // Re-allocates a scratch buffer only when the wanted size exceeds its capacity.
+    auto ensure_capacity = [](auto &buffer, auto &capacity, auto wanted) {
+        if (capacity < wanted) {
+            capacity = wanted;
+            buffer = std::shared_ptr<float>(new float[capacity], std::default_delete<float[]>());
+        }
+    };
+
+    // Stage 1: resample the caller's audio to the engine input format.
+    int frames_16k = m_resample2_16->get_out_samples(in_len / m_channel);
+    // Guard: a block must not exceed this length.
+    if (frames_16k > gs_src_samplerate) {
+        return ERR_RVC_LITE_BLOCK_TOO_LONG;
+    }
+    ensure_capacity(m_buf_tmp_16k, m_buf_tmp_16k_cap, frames_16k);
+    m_buf_tmp_16k_len = frames_16k;
+    int status = m_resample2_16->resample(in_buf, in_len / m_channel, m_buf_tmp_16k.get(), m_buf_tmp_16k_len);
+    if (status != ERR_RVC_LITE_SUCCESS) {
+        return status;
+    }
+
+    // Stage 2: inference; the output buffer is sized at twice the input length.
+    ensure_capacity(m_buf_tmp_32k, m_buf_tmp_32k_cap, m_buf_tmp_16k_len * 2);
+    m_buf_tmp_32k_len = m_buf_tmp_16k_len * 2;
+    status = m_rvc_inst->process_block(m_buf_tmp_16k.get(), m_buf_tmp_16k_len, m_buf_tmp_32k.get(), m_buf_tmp_32k_len);
+    if (status != ERR_RVC_LITE_SUCCESS) {
+        return status;
+    }
+
+    // Stage 3: resample back to the caller's rate and channel layout.
+    int frames_back = m_resample2src->get_out_samples(m_buf_tmp_32k_len);
+    ensure_capacity(m_buf_tmp_src, m_buf_tmp_src_cap, frames_back * m_channel);
+    m_buf_tmp_src_len = frames_back;
+    status = m_resample2src->resample(m_buf_tmp_32k.get(), m_buf_tmp_32k_len, m_buf_tmp_src.get(), m_buf_tmp_src_len);
+    if (status != ERR_RVC_LITE_SUCCESS) {
+        return status;
+    }
+
+    // Hand back the smaller of what was asked for and what was produced.
+    const auto produced = m_buf_tmp_src_len * m_channel;
+    if (produced < out_len) {
+        out_len = produced;
+    }
+    memcpy(out_buf, m_buf_tmp_src.get(), sizeof(float) * out_len);
+    return ERR_RVC_LITE_SUCCESS;
+}
+
+/**
+ * Measures how long process() takes on just under one second of silence.
+ *
+ * The test block holds (m_sample_rate - 100) frames, i.e. roughly one second of
+ * audio, so the elapsed time in seconds approximates the real-time factor.
+ * Input and output use separate zero-initialised buffers, and only the process()
+ * call itself is timed. The return code of process() is ignored, as before.
+ *
+ * @return elapsed seconds of the process() call, or 0 when the configured format
+ *         yields no samples to process
+ */
+float CRvcLiteSynthesizer::get_rtf()
+{
+    // 100 frames short of one second — presumably to stay below the block-length
+    // limit that process() enforces (TODO confirm).
+    const int in_len = m_sample_rate * m_channel - 100 * m_channel;
+    if (in_len <= 0)
+    {
+        return 0.0f;
+    }
+    int out_len = in_len;
+
+    // Owning, zero-filled buffers: no uninitialised reads and no manual delete[].
+    std::shared_ptr<float> in_buf(new float[in_len](), std::default_delete<float[]>());
+    std::shared_ptr<float> out_buf(new float[out_len](), std::default_delete<float[]>());
+
+    struct timeval start;
+    struct timeval end;
+    gettimeofday(&start, NULL);
+    process(in_buf.get(), in_len, out_buf.get(), out_len);
+    gettimeofday(&end, NULL);
+
+    const double elapsed_ms = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0;
+    return static_cast<float>(elapsed_ms / 1000);
+}
\ No newline at end of file
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:32 (1 d, 15 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347189
Default Alt Text
(56 KB)
Attached To
R350 av_svc
Event Timeline
Log In to Comment