Page MenuHomePhabricator

No OneTemporary

diff --git a/mnn_demo/CMakeLists.txt b/mnn_demo/CMakeLists.txt
index 0faf584..895b2c7 100644
--- a/mnn_demo/CMakeLists.txt
+++ b/mnn_demo/CMakeLists.txt
@@ -1,70 +1,69 @@
# Minimum CMake version and project declaration.
cmake_minimum_required(VERSION 3.0)
project(mnn_demo)

# Compile all C++ sources as C++14.
set(CMAKE_CXX_STANDARD 14)

# Default locations of the prebuilt MNN and FFmpeg trees (non-Windows hosts).
set(MNN_DIR "/opt/soft/MNN")
set(FFMPEG_DIR "/opt/soft/ffmpeg/ffmpeg")

# Library targets are written to <project>/lib; the static archives linked
# into mnn_demo further down are picked up from this directory.
set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib)

# Windows hosts keep the dependencies on drive D: instead.
if(WIN32)
    message(STATUS "Now is windows")
    set(MNN_DIR "D:/soft/mnn/MNN")
    set(FFMPEG_DIR "D:/soft/ffmpeg/linux_ffmpeg")
endif()
# Header search paths. They are intentionally directory-scoped: the `ref`
# subdirectory is added after these calls and therefore inherits them too.

# MNN headers
include_directories(
    ${MNN_DIR}/include
    ${MNN_DIR}/include/MNN
)

# FFmpeg headers
include_directories(${FFMPEG_DIR}/include)

# Project headers (own sources first, then the bundled helper libraries)
include_directories(
    inc
    src
    ref
    ref/av_waves/waves/inc
    ref/av_resample/audio_resample/inc
    ref/thread
    third_party
)

# Third-party dependencies built from source
add_subdirectory(ref)
# Source files of the demo and of the bundled ESPYIN pitch tracker.
# The pattern is "*.cpp" (with the dot) so that only files carrying the .cpp
# extension are collected; the previous "*cpp" also matched any file whose
# name merely ended in the letters "cpp".
# NOTE: file(GLOB) is evaluated at configure time only - re-run CMake after
# adding or removing a source file.
file(GLOB SRC_DIR ${PROJECT_SOURCE_DIR}/src/*.cpp)
file(GLOB THIRD_SRC_DIR ${PROJECT_SOURCE_DIR}/third_party/espyin-v1.0/*.cpp)
#add_library(svc_lite ${SRC_DIR})
-add_executable(mnn_demo main.cpp ${SRC_DIR} ${THIRD_SRC_DIR}
- tests/test_flatbuffer.cpp)
+add_executable(mnn_demo main.cpp ${SRC_DIR} ${THIRD_SRC_DIR})
#add_executable(test_circle_buffer tests/test_CRvcCircleBuffer.cpp src/CRvcCircleBuffer.cpp)
# Static libraries linked into the demo executable.
#
# libMNN.a is located through MNN_DIR (instead of a hard-coded
# /opt/soft/MNN path) so that the WIN32 override of MNN_DIR - or any other
# change to that variable - is honoured here as well. With the default
# MNN_DIR the resulting path is unchanged.
#
# The helper archives are the ones produced by the `ref` subdirectory into
# LIBRARY_OUTPUT_PATH.
target_link_libraries(mnn_demo PRIVATE
    ${MNN_DIR}/build/libMNN.a
    ${LIBRARY_OUTPUT_PATH}/libwaves.a
    ${LIBRARY_OUTPUT_PATH}/libthread.a
    ${LIBRARY_OUTPUT_PATH}/libaudio_resample.a
)

# FFmpeg static archives. Order matters for static linking: a library must
# come before the libraries it depends on, so libavutil stays last.
target_link_libraries(mnn_demo PRIVATE
    ${FFMPEG_DIR}/lib/libavfilter.a
    ${FFMPEG_DIR}/lib/libavformat.a
    ${FFMPEG_DIR}/lib/libavcodec.a
    ${FFMPEG_DIR}/lib/libswresample.a
    ${FFMPEG_DIR}/lib/libswscale.a
    ${FFMPEG_DIR}/lib/libavutil.a
)
# 测试代码
#include_directories(${MNN_DIR}/tools)
#include_directories(${MNN_DIR}/3rd_party/flatbuffers/include)
#
#add_executable(test_flat_buffer tests/test_flatbuffer.cpp ${FLAT_SRC_DIR}
# ${MNN_DIR}/3rd_party/flatbuffers/src/code_generators.cpp
# ${MNN_DIR}/3rd_party/flatbuffers/src/idl_gen_text.cpp
# ${MNN_DIR}/3rd_party/flatbuffers/src/idl_parser.cpp
# ${MNN_DIR}/3rd_party/flatbuffers/src/util.cpp
# tests/half.hpp
#)
\ No newline at end of file
diff --git a/mnn_demo/inc/CRvcLiteOnline.h b/mnn_demo/inc/CRvcLiteOnline.h
index f1c7677..ad62799 100644
--- a/mnn_demo/inc/CRvcLiteOnline.h
+++ b/mnn_demo/inc/CRvcLiteOnline.h
@@ -1,296 +1,309 @@
//
// Created by jianli.yang on 2023/11/29.
//
#ifndef MNN_DEMO_CRVCLITEONLINE_H
#define MNN_DEMO_CRVCLITEONLINE_H
// Logging macros.
//   LOGD(TAG, fmt, ...) - debug message
//   LOGE(TAG, fmt, ...) - error message
// On Android they forward to __android_log_print (or expand to nothing when
// STRELEASE is defined). Elsewhere they print to stdout while DEBUG is
// defined, and expand to nothing otherwise.
// Every call site must end with a semicolon, exactly like a function call.
#define DEBUG
#ifdef __ANDROID__
#include <android/log.h>
#ifdef STRELEASE
#define LOGD(...)
#define LOGE(...)
#else
#define LOGD(TAG, ...) __android_log_print(ANDROID_LOG_DEBUG , TAG, __VA_ARGS__)
#define LOGE(TAG, ...) __android_log_print(ANDROID_LOG_ERROR , TAG, __VA_ARGS__)
#endif
#else
// printf is used below, so make sure its declaration is visible.
#include <cstdio>
#ifdef DEBUG
// Wrapped in do { } while (0) so the two printf calls behave as ONE statement;
// otherwise `if (c) LOGD(...); else ...` would not compile and
// `if (c) LOGD(...);` would run the second printf unconditionally.
#define LOGD(TAG, ...) do { printf("\nDebug: %s",TAG); printf(__VA_ARGS__); } while (0)
#define LOGE(TAG, ...) do { printf("\nError: %s",TAG); printf(__VA_ARGS__); } while (0)
#else
#define LOGD(TAG, ...)
#define LOGE(TAG, ...)
#endif
#endif
#include <mutex>
#include <string>
#include <memory>
#include <vector>
#include <condition_variable>
// Fixed processing parameters of the RVC-lite pipeline.
// Sample rate expected at the input of CRvcLiteOnline.
#define gs_src_samplerate 16000
// Sample rate used to size the output side buffers of CRvcLiteOnline.
#define gs_dst_samplerate 32000
#define gs_crossfade_time 0.08 // in seconds
// Multiplied by the sample rates to obtain the block length in samples.
#define gs_block_time 1
// Multiplied by the sample rates to obtain the extra context length in samples.
#define gs_extra_time 1
#define gs_hubert_frame 206 // model-dependent
#define gs_hubert_dim 256 // model-dependent
#define gs_synth_input_frame 205 // model-dependent
#define gs_synth_input_dim 258 // model-dependent
#define gs_synth_output_frame 35840 // model-dependent
enum {
ERR_RVC_LITE_SUCCESS = 0,
ERR_RVC_LITE_NOT_INIT = 1,
ERR_RVC_LITE_REINIT = 2,
ERR_RVC_LITE_RT_REINIT = 3,
ERR_RVC_LITE_RT_NOT_INIT = 4,
ERR_RVC_LITE_RT_NOT_ENOUGH_DATA = 5,
ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR = 6, // sample rate is below 16000
ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT = 7, // resampler output buffer is too short
+ ERR_RVC_LITE_NOT_SWITCH_MODEL = 8, // no synth (voice) model has been loaded via switch_synth_model yet
+ ERR_RVC_LITE_MODEL_NOT_EXISTS = 9, // the synth (voice) model file does not exist
+ ERR_RVC_LITE_BLOCK_TOO_LONG = 10, // block is too large
};
+const int RVC_LITE_RT_SYN_STATE_DEFAULT = 0;
+const int RVC_LITE_RT_SYN_STATE_EFFECT = 1;
+const int RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT = 2;
+const int RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT = 3;
+const int RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT = 4;
class Hubert;
class CSynthesizer;
class ESPYIN;
class CThreadPool;
class CRvcCircleBuffer;
class CFfmpegResampler;
/**
* Rvc轻量化实时推理代码
* 要求输入16k的音频数据,输出是目标采样率的数据
*/
class CRvcLiteOnline {
public:
CRvcLiteOnline();
~CRvcLiteOnline();
private:
void uninit();
void get_f0_post();
void get_pyin_f0();
void init_variable();
public:
/**
* 初始化函数
* @param hubert_model_path
- * @param synth_model_path
* @return 0 表示正常
*/
- int init(const char *hubert_model_path, const char *synth_model_path);
+ int init(const char *hubert_model_path);
/**
* 换音色模型
* @param synth_model_path
* @return
*/
int switch_synth_model(const char* synth_model_path);
/**
* 处理定长的一帧数据
* 要求输入单声道16k音频
* @param in_buf
* @param in_len 长度小于等于gs_src_samplerate,最佳是gs_src_samplerate
* @param out_buf
* @param out_len 小于等于gs_dst_samplerate,最佳是gs_dst_samplerate[和输入有关,如果是32k,则恰好是输入的两倍]
* @return 0 表示正常
*/
int process_block(float *in_buf, int in_len, float *out_buf, int out_len);
/**
* 清空存储
* @return
*/
void reset();
/**
* 获取延迟时间
* @return
*/
int get_latency_ms();
private:
// 是否进行过init
bool m_init;
+ bool m_switch_model;
std::shared_ptr<Hubert> m_hubert_inst;
std::shared_ptr<CSynthesizer> m_synthesizer_inst;
std::shared_ptr<ESPYIN> m_es_pyin;
// 缓存使用的数据
// 要求输入的时间片长度,采样点数
int m_input_block_frame;
// 推理时额外需要的长度
int m_input_extra_frame;
// 推理时使用的buffer长度
int m_input_predict_buf_frame;
// 推理时使用的buffer
float *m_input_predict_buf;
std::vector<float> m_f0_data;
std::vector<float> m_f0_coarse_data;
// 输出的情况
int m_crossfade_frame;
int m_output_block_frame;
int m_output_cache_buf_frame;
float *m_crossfade_buf;
float *m_output_cache_buf;
+ bool m_fade_in;
// 各个实例的返回结果
std::vector<std::vector<std::vector<float>>> m_hubert_ret;
std::vector<std::vector<std::vector<float>>> m_synth_input;
std::vector<std::vector<std::vector<float>>> m_synth_out;
};
/**
 * Sample-rate / channel-count converter.
 *
 * init() decides how the conversion is carried out: when only the channel
 * count differs (same sample rate) no CFfmpegResampler is created and the
 * stereo<->mono conversion is done in resample() itself; in every other case
 * the work is delegated to a CFfmpegResampler instance.
 */
class CResample {
public:
    CResample();
    ~CResample();

public:
    /**
     * Configure the converter.
     * @param in_samplerate  input sample rate
     * @param out_samplerate output sample rate
     * @param in_channel     input channel count (default 1)
     * @param out_channel    output channel count (default 1)
     * @return ERR_RVC_LITE_SUCCESS, or the error code of the underlying resampler
     */
    int init(int in_samplerate, int out_samplerate, int in_channel=1, int out_channel=1);

    // Returns the number of output samples per channel for `num` input samples.
    int get_out_samples(int num);

    // Latency reported by the underlying resampler (0 when none is used).
    int get_latency();

    // Resets the underlying resampler, if any.
    void reset();

    // Converts whatever is available without relying on internal caching.
    // in_num and out_num are both sample counts PER CHANNEL.
    int resample(float * in_buf, int in_num, float * out_buf, int & out_num);

private:
    std::shared_ptr<CFfmpegResampler> m_resample_inst;
    // Default-initialised so that resample() never reads indeterminate values
    // when it is called before init() (the constructor does not set them).
    int m_in_channel = 1;
    int m_out_channel = 1;
};
/**
* 实时处理的类
* 入一帧出一帧,允许非常短的帧做输入,延迟较高,在2s左右
* 思路:
* 1. 构造函数设置变量
* 2. init初始化环境,开启处理线程
* 3. process,每次送一帧,触发一次判断逻辑
* 4. flush函数将输入的未处理的数据全部处理一次,联合之前没有被取出的数据一起刷出来
* 5. 析构时关闭处理线程,并释放所有空间
*/
class CRvcLiteOnlineRealTime {
public:
CRvcLiteOnlineRealTime();
~CRvcLiteOnlineRealTime();
private:
void init_variable();
void rvc_process();
void rvc_process_step();
void uninit();
void stop();
public:
/**
* 初始化函数
* @param hubert_model_path
- * @param synth_model_path
* @param sample_rate
* @param channel
* @return
*/
- int init(const char *hubert_model_path, const char *synth_model_path, int sample_rate, int channel);
+ int init(const char *hubert_model_path, int sample_rate, int channel);
/**
* 切换音色
* @param synth_model_path
* @return
*/
int switch_synth(const char *synth_model_path);
/**
* 清空缓存
*/
void reset();
/**
* 入一帧,出一帧,要求长度一致
* 两者可以是同一块buffer
* @param in_buf
* @param in_len
* @param out_buf
* @param out_len
* @return
*/
int process(float *in_buf, int in_len, float *out_buf, int out_len);
/**
* 将所有处理好的结果获取出来
* 因为不确定还有多少,所以由内部来开辟空间,外部进行释放
* @return
*/
void flush(float *&out_buf, int &len);
/**
* 获取延迟时间
*/
int get_latency_ms();
private:
int m_sample_rate;
int m_channel;
std::shared_ptr<CRvcCircleBuffer> m_resample_queue;
std::shared_ptr<CRvcCircleBuffer> m_input_queue;
std::shared_ptr<CRvcCircleBuffer> m_out_queue;
int m_input_tmp_buf_len;
int m_output_tmp_buf_len;
std::shared_ptr<float> m_input_tmp_buf;
std::shared_ptr<float> m_output_tmp_buf;
std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
std::shared_ptr<CThreadPool> m_thread_pool;
// 逻辑变量
bool m_init;
// 处理线程相关
bool m_rvc_stop;
std::mutex m_rvc_mutex;
std::condition_variable m_rvc_cond;
// 重采样相关
std::shared_ptr<CResample> m_resample16;
std::shared_ptr<CResample> m_resample2src;
int m_resample_buf_max_len;
std::shared_ptr<float> m_resample_in_buf;
std::shared_ptr<float> m_resample_out_buf;
// 切换音色
std::string m_synth_path;
std::string m_new_synth_path;
+
+ // 合成的状态
+ int m_syn_state;
+ // 延迟器
+ std::shared_ptr<CRvcCircleBuffer> m_latency_queue;
};
#endif //MNN_DEMO_CRVCLITEONLINE_H
diff --git a/mnn_demo/inc/CRvcLiteSynthesizer.h b/mnn_demo/inc/CRvcLiteSynthesizer.h
new file mode 100644
index 0000000..bac8c21
--- /dev/null
+++ b/mnn_demo/inc/CRvcLiteSynthesizer.h
@@ -0,0 +1,58 @@
+//
+// Created by Administrator on 2024/1/21.
+//
+
+#ifndef MNN_DEMO_CRVCLITESYNTHESIZER_H
+#define MNN_DEMO_CRVCLITESYNTHESIZER_H
+#include "CRvcLiteOnline.h"
+
+class CRvcLiteSynthesizer
+{
+public:
+ CRvcLiteSynthesizer();
+ ~CRvcLiteSynthesizer();
+
+public:
+ /**
+ * 初始化
+ * @param hubert_model 语义模型地址
+ * @param synth_model 音色模型地址
+ * @param sample_rate 采样率
+ * @param channel 通道数
+ * @return 0 表示正常
+ */
+ int init(const char* hubert_model, const char* synth_model, int sample_rate, int channel);
+
+ /**
+ * 处理逻辑
+ * @param in_buf 输入的buf
+ * @param in_len 输入的Buf长度,frame*channel,建议输入小于等于1s的音频长度,尽量的大就好
+ * @param out_buf 输出的buf
+ * @param out_len 输出的buf长度, frame*channel
+ * 注意: 此处有可能出现输出的长度不一定等于in_len,输出的值会小于等于out_len,但是是连续的,所以out_len可以适当比in_len大一些,从而保证都能搞出来
+ * @return
+ */
+ int process(float* in_buf, int in_len, float* out_buf, int &out_len);
+
+ // 获取实时率,处理1s数据的真实耗时/1s
+ float get_rtf();
+
+private:
+ std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
+ std::shared_ptr<CResample> m_resample2_16;
+ std::shared_ptr<CResample> m_resample2src;
+ int m_channel;
+ int m_sample_rate;
+ std::shared_ptr<float> m_buf_tmp_16k;
+ int m_buf_tmp_16k_len;
+ int m_buf_tmp_16k_cap;
+ std::shared_ptr<float> m_buf_tmp_32k;
+ int m_buf_tmp_32k_len;
+ int m_buf_tmp_32k_cap;
+ std::shared_ptr<float> m_buf_tmp_src;
+ int m_buf_tmp_src_len;
+ int m_buf_tmp_src_cap;
+};
+
+
+#endif //MNN_DEMO_CRVCLITESYNTHESIZER_H
diff --git a/mnn_demo/main.cpp b/mnn_demo/main.cpp
index 7b4f6dc..8aa637d 100644
--- a/mnn_demo/main.cpp
+++ b/mnn_demo/main.cpp
@@ -1,171 +1,221 @@
#include <sys/time.h>
#include <thread>
#include <chrono>
#include "src/Hubert.h"
#include "src/CSynthesizer.h"
-
+#include "CRvcLiteSynthesizer.h"
/**
 * Smoke test: loads the hubert model and runs one inference on a constant
 * input signal. Results are not checked.
 * @return always 0
 */
int test_hubert() {
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v1_fp16.mnn";

    Hubert hubert;
    int err_code = hubert.init(hubert_model_path);

    // 33280 input samples, all set to 0.1.
    std::vector<float> samples(33280, 0.1);

    // Output container shaped 1 x 205 x 256, zero-filled.
    std::vector<std::vector<std::vector<float>>> features(
            1, std::vector<std::vector<float>>(205, std::vector<float>(256)));

    float time = hubert.process(samples.data(), features);
    return 0;
}
/**
 * Smoke/benchmark test: loads the synthesizer model, runs it several times on
 * a constant input and prints the AVERAGE of the values returned by process().
 * @return always 0
 */
int test_contentvec() {
    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
    CSynthesizer contentVec;
    int err_code = contentVec.init(contentvec_model_path);

    // Input shaped 1 x 205 x 258: the first 256 values of every frame are 0.1,
    // followed by 0.2 (index 256) and 1.0 (index 257).
    std::vector<std::vector<std::vector<float>>> input(1);
    input[0].resize(205);
    for (int i = 0; i < 205; i++) {
        for (int j = 0; j < 258; j++) {
            if (j == 256) {
                input[0][i].push_back(0.2);
            } else if (j == 257) {
                input[0][i].push_back(1.0);
            } else {
                input[0][i].push_back(0.1);
            }
        }
    }

    // Output container shaped 1 x 1 x 35840.
    std::vector<std::vector<std::vector<float>>> ret;
    ret.resize(1);
    for (int i = 0; i < 1; i++) {
        ret[i].resize(1);
        ret[i][0].resize(35840);
    }

    // The divisor is tied to the loop count; previously 10 runs were divided
    // by 100, which under-reported the average by a factor of ten.
    const int run_count = 10;
    float tot = 0.f;
    for (int i = 0; i < run_count; i++) {
        float time = contentVec.process(input, ret);
        tot += time;
    }
    printf("time: %f \n", tot / run_count);
    return 0;
}
#include "CRvcLiteOnline.h"
#include "av_waves/waves/inc/STWaveFile.h"
void test() {
const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_16.wav";
// const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v1.wav";
CRvcLiteOnline rvc_inst;
- rvc_inst.init(hubert_model_path, contentvec_model_path);
+ rvc_inst.init(hubert_model_path);
// 读取音频文件, 要求16k,单声道
STCWaveFile wav_inst(in_wav, false);
int sample_rate = wav_inst.GetSampleRate();
int channel = wav_inst.GetChannels();
int len = wav_inst.GetTotalFrames() * channel;
float *data = new float[len];
float *outdata = new float[len * 2];
wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
int step = sample_rate;
printf("start ..\n");
for (int i = 0; i < len; i += step) {
if (i + step > len) {
step = len - i;
}
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
rvc_inst.process_block(data + i, step, outdata + 2 * i, 2 * step);
gettimeofday(&end, NULL);
printf("sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
}
STCWaveFile wav_out_inst(out_wav, true);
wav_out_inst.SetSampleRate(32000);
wav_out_inst.SetChannels(1);
wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
wav_out_inst.SetupDone();
wav_out_inst.WriteFrame(outdata, len * 2);
printf("finish2 ....\n");
}
+void test_rvc_lite_synth()
+{
+ const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
+ const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
+ const char *out_wav = "/mnt/d/dataset/tmp/i_out3.wav";
+ const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
+
+ STCWaveFile wav_inst(in_wav, false);
+ int sample_rate = wav_inst.GetSampleRate();
+ int channel = wav_inst.GetChannels();
+ int len = wav_inst.GetTotalFrames() * channel;
+ float *data = new float[len];
+ float *outdata = new float[len];
+ wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
+ CRvcLiteSynthesizer m_rvc_inst;
+ int err = m_rvc_inst.init(hubert_model_path, syz_model, sample_rate, channel);
+ printf("init err=%d!\n", err);
+ printf("rtf=%f\n", m_rvc_inst.get_rtf());
+ int step = sample_rate * channel - 100 * channel;
+ int out_len = 0;
+ for(int i = 0; i < len; i+=step)
+ {
+ if (i + step > len) {
+ step = len - i;
+ }
+ int out_step = step;
+ err = m_rvc_inst.process(data+i, step, outdata+out_len, out_step);
+ if(err != ERR_RVC_LITE_SUCCESS)
+ {
+ printf("process err!\n");
+ return ;
+ }
+ out_len += out_step;
+ }
+ STCWaveFile wav_out_inst(out_wav, true);
+ wav_out_inst.SetSampleRate(sample_rate);
+ wav_out_inst.SetChannels(channel);
+ wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
+ wav_out_inst.SetupDone();
+ wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
+ delete[] data;
+ delete[] outdata;
+}
+
+
void test_rvc_lite_online() {
// const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
// const char *hubert_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/layers6_checkpoint_14_1660000_1_hubert.mnn";
const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
// const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";
// const char *syz_model = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xusong_v1_6hubert_hifix_syz_base_vctk_kd_32k_hubert6_jianli_e225_s62775_205.mnn";
- const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_xusong.mnn";
+ const char *xs_model = "/mnt/d/dataset/svc/models/layers_3/layer3_xusong.mnn";
+ const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
// const char *contentvec_model_path = "/mnt/d/dataset/svc/models/layer6_bingxiao_v1/mnn/xiafan_fp16.mnn";
- const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01.wav";
+// const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01.wav";
+ const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
// const char* in_wav = "/mnt/d/dataset/svc/dataset/短数据样本/男声/qiankun.wav";
// const char* in_wav = "/mnt/d/dataset/tmp/i.wav";
// const char *in_wav = "/mnt/d/code/develop/svc/Retrieval-based-Voice-Conversion-WebUI/online/1_1.wav";
// const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v4.wav";
// const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/qiankun_412_v4.wav";
const char *out_wav = "/mnt/d/dataset/tmp/i_out2.wav";
// 读取音频文件, 要求16k,单声道
STCWaveFile wav_inst(in_wav, false);
int sample_rate = wav_inst.GetSampleRate();
int channel = wav_inst.GetChannels();
int len = wav_inst.GetTotalFrames() * channel;
float *data = new float[len];
float *outdata = new float[len];
CRvcLiteOnlineRealTime rvc_inst;
- rvc_inst.init(hubert_model_path, syz_model, sample_rate, channel);
+ rvc_inst.init(hubert_model_path, sample_rate, channel);
wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
int step = 1024;
printf("start ..\n");
bool flag = true;
+ rvc_inst.switch_synth(syz_model);
for (int i = 0; i < len; i += step) {
if (i + step > len) {
step = len - i;
}
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
int ret = rvc_inst.process(data + i, step, outdata+i, step);
std::this_thread::sleep_for(std::chrono::milliseconds (15));
gettimeofday(&end, NULL);
printf("ret = %d, sp = %f ms step=%d\n", ret,
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0, step);
if (flag && i >= len / 3) {
flag = false;
- rvc_inst.switch_synth(syz_model);
+ rvc_inst.reset();
+// rvc_inst.switch_synth(xs_model);
}
}
STCWaveFile wav_out_inst(out_wav, true);
wav_out_inst.SetSampleRate(sample_rate);
wav_out_inst.SetChannels(channel);
wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
wav_out_inst.SetupDone();
wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
float* flush_data;
int flush_len;
rvc_inst.flush(flush_data, flush_len);
wav_out_inst.WriteFrame(flush_data, flush_len/channel);
printf("finish2 ....\n");
}
int main() {
// int ret_hubert = test_hubert();
// int ret_contentvec = test_contentvec();
// test();
// test();
- test_rvc_lite_online();
+// test_rvc_lite_online();
+ test_rvc_lite_synth();
return 0;
}
diff --git a/mnn_demo/src/CRvcLiteOnline.cpp b/mnn_demo/src/CRvcLiteOnline.cpp
index 3a08e27..4f9c833 100644
--- a/mnn_demo/src/CRvcLiteOnline.cpp
+++ b/mnn_demo/src/CRvcLiteOnline.cpp
@@ -1,633 +1,811 @@
//
// Created by Administrator on 2023/11/29.
//
#include <cmath>
#include <cstring>
#include <sys/time.h>
#include "CRvcLiteOnline.h"
#include "Hubert.h"
#include "CSynthesizer.h"
#include "espyin-v1.0/ESPYIN.h"
#include "ThreadPool.h"
#include "CRvcCircleBuffer.h"
#include "FfmpegResampler.h"
+#include <unistd.h>
+
+inline bool file_exists (const std::string& name) {
+ return ( access( name.c_str(), F_OK ) != -1 );
+}
/**
 * Down-mixes interleaved stereo samples to mono by averaging each L/R pair.
 * @param input  interleaved stereo samples
 * @param size   length of `input` in floats (NOT the number of frames);
 *               a trailing unpaired sample is ignored
 * @param output receives size / 2 mono samples
 */
void stereo2mono(float *input, int size, float *output) {
    const int frames = size / 2;
    for (int frame = 0; frame < frames; ++frame) {
        const float left = input[2 * frame];
        const float right = input[2 * frame + 1];
        output[frame] = (left + right) / 2;
    }
}
/**
 * Up-mixes mono samples to interleaved stereo by duplicating every sample
 * into the left and right slot.
 * @param input  mono samples
 * @param size   number of mono samples in `input`
 * @param output receives 2 * size interleaved samples
 */
void mono2stereo(float *input, int size, float *output) {
    int frame = 0;
    while (frame < size) {
        const float sample = input[frame];
        float *pair = output + 2 * frame;
        pair[0] = sample;
        pair[1] = sample;
        ++frame;
    }
}
CRvcLiteOnline::CRvcLiteOnline() {
init_variable();
m_init = false;
+ m_switch_model = false;
// 输入部分需要的变量
// 要求输入的时间片长度,采样点数
m_input_block_frame = int(gs_block_time * gs_src_samplerate);
// 推理时额外需要的长度
m_input_extra_frame = int(gs_extra_time * gs_src_samplerate);
int zc = gs_src_samplerate / 100; // 10ms的点数
int input_corssfade_frame = int(gs_crossfade_time * gs_src_samplerate);
// 推理时使用的buffer长度
m_input_predict_buf_frame = int(ceil((m_input_extra_frame + input_corssfade_frame + m_input_block_frame)
* 1.0 / zc) * zc);
// 推理时使用的buffer
m_input_predict_buf = new float[m_input_predict_buf_frame];
memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);
// 输出部分需要的变量
m_crossfade_frame = int(gs_crossfade_time * gs_dst_samplerate);
m_output_block_frame = int(gs_block_time * gs_dst_samplerate);
int output_extra_frame = int(gs_extra_time * gs_dst_samplerate);
zc = gs_dst_samplerate / 100;
m_output_cache_buf_frame = int(ceil((m_output_block_frame + m_crossfade_frame + output_extra_frame)
* 1.0 / zc) * zc);
m_output_cache_buf = new float[m_output_cache_buf_frame];
memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
m_crossfade_buf = new float[m_crossfade_frame];
memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);
// 对于模型的输入和输出进行缓存
// 此处是写死的和模型有关
m_hubert_ret.resize(1);
m_hubert_ret[0].resize(gs_hubert_frame);
for (int i = 0; i < gs_hubert_frame; i++) {
m_hubert_ret[0][i].resize(gs_hubert_dim);
}
// synth模型的输入
m_synth_input.resize(1);
m_synth_input[0].resize(gs_synth_input_frame);
for (int i = 0; i < gs_synth_input_frame; i++) {
m_synth_input[0][i].resize(gs_synth_input_dim);
}
m_synth_out.resize(1);
m_synth_out[0].resize(1);
m_synth_out[0][0].resize(gs_synth_output_frame);
}
// Destructor: releases the buffers allocated by the constructor via uninit().
CRvcLiteOnline::~CRvcLiteOnline() {
uninit();
}
/**********************************对内函数*********************************************/
/**
 * Frees the three heap buffers owned by this object and puts every member
 * back into its "not initialised" state through init_variable().
 */
void CRvcLiteOnline::uninit() {
    // delete[] on a null pointer is a no-op, so no explicit null checks are needed.
    delete[] m_input_predict_buf;
    m_input_predict_buf = NULL;

    delete[] m_output_cache_buf;
    m_output_cache_buf = NULL;

    delete[] m_crossfade_buf;
    m_crossfade_buf = NULL;

    init_variable();
}
void CRvcLiteOnline::get_pyin_f0() {
for (int i = 0; i < m_input_predict_buf_frame; i += 160) {
m_es_pyin->process(m_input_predict_buf + i);
}
m_f0_data.clear();
ESFeatureSet feats = m_es_pyin->getRemainingFeatures();
if (!feats.empty()) {
m_f0_data.resize(feats[4].size());
for (size_t i = 0; i < feats[4].size(); ++i) {
// JL_DEBUG
m_f0_data[i] = feats[4][i].values[0];
if (m_f0_data[i] < 0) {
m_f0_data[i] = 0;
}
}
}
m_es_pyin->reset();
get_f0_post();
}
/**
 * Converts m_f0_data into the quantised track m_f0_coarse_data.
 *
 * Every f0 value is mapped to a mel-style scale, normalised so that the
 * range [f0_min, f0_max] spans 1..255, clamped to [1, 255] and rounded to the
 * nearest integer. Values whose mel value is not positive end up as 1.
 * The logarithm base cancels out in the normalisation, so log2 yields the
 * same coarse values as a natural logarithm would.
 */
void CRvcLiteOnline::get_f0_post() {
    const int f0_min = 50;
    const int f0_max = 1100;
    const float mel_low = 1127 * log2(1 + f0_min * 1.0 / 700);
    const float mel_high = 1127 * log2(1 + f0_max * 1.0 / 700);

    const size_t count = m_f0_data.size();
    m_f0_coarse_data.assign(count, 0.f);

    for (size_t idx = 0; idx < count; ++idx) {
        float mel = 1127 * log2(1 + m_f0_data[idx] / 700);
        if (mel > 0) {
            mel = (mel - mel_low) * 254.f / (mel_high - mel_low) + 1;
        }
        // Clamp to the valid coarse range.
        mel = (mel <= 1) ? 1 : ((mel > 255) ? 255 : mel);
        // Round half up, then store as float.
        m_f0_coarse_data[idx] = float(int(mel + 0.5));
    }
}
void CRvcLiteOnline::init_variable() {
m_init = false;
+ m_switch_model = false;
// 缓存使用的数据
// 要求输入的时间片长度,采样点数
m_input_block_frame = 0;
m_input_extra_frame = 0;
m_input_predict_buf_frame = 0;
m_input_predict_buf = nullptr;
m_f0_data.clear();
m_f0_coarse_data.clear();
m_crossfade_frame = 0;
m_output_block_frame = 0;
m_output_cache_buf_frame = 0;
m_crossfade_buf = nullptr;
m_output_cache_buf = nullptr;
// 各个实例的返回结果
m_hubert_ret.clear();
m_synth_input.clear();
m_synth_out.clear();
+
+ m_fade_in = true;
}
/**********************************对外函数*********************************************/
-int CRvcLiteOnline::init(const char *hubert_model_path, const char *synth_model_path) {
+int CRvcLiteOnline::init(const char *hubert_model_path) {
if (m_init) {
return ERR_RVC_LITE_REINIT;
}
m_hubert_inst = std::make_shared<Hubert>();
m_synthesizer_inst = std::make_shared<CSynthesizer>();
m_hubert_inst->init(hubert_model_path);
- m_synthesizer_inst->init(synth_model_path);
+// m_synthesizer_inst->init(synth_model_path);
// 要求stepSize必须是2^n
m_es_pyin = std::make_shared<ESPYIN>(16000, 160, 1024, 50, 1100);
m_init = true;
+ m_switch_model = false;
+ m_fade_in = true;
return ERR_RVC_LITE_SUCCESS;
}
int CRvcLiteOnline::switch_synth_model(const char *synth_model_path) {
if (!m_init) {
return ERR_RVC_LITE_NOT_INIT;
}
- m_synthesizer_inst = std::make_shared<CSynthesizer>();
- m_synthesizer_inst->init(synth_model_path);
- return ERR_RVC_LITE_SUCCESS;
+ if (file_exists(synth_model_path))
+ {
+ m_synthesizer_inst = std::make_shared<CSynthesizer>();
+ m_synthesizer_inst->init(synth_model_path);
+ m_switch_model = true;
+ return ERR_RVC_LITE_SUCCESS;
+ }
+ return ERR_RVC_LITE_MODEL_NOT_EXISTS;
}
void CRvcLiteOnline::reset() {
memset(m_input_predict_buf, 0, sizeof(float) * m_input_predict_buf_frame);
memset(m_crossfade_buf, 0, sizeof(float) * m_crossfade_frame);
memset(m_output_cache_buf, 0, sizeof(float) * m_output_cache_buf_frame);
+ m_fade_in = true;
}
int CRvcLiteOnline::process_block(float *in_buf, int in_len, float *out_buf, int out_len) {
if (!m_init) {
return ERR_RVC_LITE_NOT_INIT;
}
+ if (!m_switch_model)
+ {
+ return ERR_RVC_LITE_NOT_SWITCH_MODEL;
+ }
+
+ // 外部数据产生不连贯,比如做了reset的时候,需要做fade_in
+ if (m_fade_in)
+ {
+ for(int i = 0; i < in_len; i++)
+ {
+ float rate = i * 1.0 / in_len;
+ in_buf[i] = in_buf[i] * rate;
+ }
+ m_fade_in = false;
+ }
+
// 剔除尾部的block的数据
memcpy(m_input_predict_buf, m_input_predict_buf + in_len,
sizeof(float) * (m_input_predict_buf_frame - in_len));
// 向尾部填充in_buf的数据
memcpy(m_input_predict_buf + (m_input_predict_buf_frame - in_len), in_buf,
sizeof(float) * in_len);
// 提取f0特征序列
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
get_pyin_f0();
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "get pyin sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 推理hubert
gettimeofday(&start, NULL);
m_hubert_inst->process(m_input_predict_buf, m_hubert_ret);
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "m_hubert_inst sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 合成语音
for (int i = 0; i < gs_synth_input_frame; i++) {
// 拷贝数据 1,gs_hubert_frame,258
for (int j = 0; j < gs_hubert_dim; j++) {
m_synth_input[0][i][j] = m_hubert_ret[0][i][j];
}
m_synth_input[0][i][256] = m_f0_coarse_data[i];
m_synth_input[0][i][257] = m_f0_data[i];
}
gettimeofday(&start, NULL);
m_synthesizer_inst->process(m_synth_input, m_synth_out);
gettimeofday(&end, NULL);
LOGE("CRvcLiteOnline", "m_synthesizer_inst sp = %f ms\n",
(end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
// 将结果全部放到缓存中
memcpy(m_output_cache_buf, m_output_cache_buf + gs_synth_output_frame,
sizeof(float) * (m_output_cache_buf_frame - gs_synth_output_frame));
memcpy(m_output_cache_buf + (m_output_cache_buf_frame - gs_synth_output_frame),
m_synth_out[0][0].data(), sizeof(float) * gs_synth_output_frame);
int start_pos = m_output_cache_buf_frame - m_crossfade_frame - out_len;
memcpy(out_buf, m_output_cache_buf + start_pos, sizeof(float) * out_len);
// 对头部数据做fade_in以及fadeout
for (int i = 0; i < m_crossfade_frame; i++) {
float rate = float(i * 1.f / m_crossfade_frame);
out_buf[i] = rate * out_buf[i] + m_crossfade_buf[i] * (1 - rate);
}
memcpy(m_crossfade_buf, m_output_cache_buf + (m_output_cache_buf_frame - m_crossfade_frame),
sizeof(float) * m_crossfade_frame);
+
return 0;
}
/**
 * @return the cross-fade time (gs_crossfade_time, given in seconds)
 *         converted to milliseconds
 */
int CRvcLiteOnline::get_latency_ms() {
    const double latency_ms = gs_crossfade_time * 1000;
    return static_cast<int>(latency_ms);
}
/*******************************对内的类**************************************/
// Starts without an underlying resampler; init() creates one when required.
CResample::CResample()
        : m_resample_inst(nullptr)
{
}

// Nothing to release explicitly: the resampler is held by a shared_ptr.
CResample::~CResample()
{
}
/**
 * Configures the converter.
 *
 * When the sample rates are equal and only the channel count differs, no
 * ffmpeg resampler is created (resample() then converts the channels by
 * itself). In every other case a CFfmpegResampler is created and initialised.
 *
 * @return ERR_RVC_LITE_SUCCESS, or the result of CFfmpegResampler::init
 */
int CResample::init(int in_samplerate, int out_samplerate, int in_channel, int out_channel)
{
    m_in_channel = in_channel;
    m_out_channel = out_channel;

    const bool channel_change_only =
            (in_samplerate == out_samplerate) && (in_channel != out_channel);
    if (channel_change_only) {
        m_resample_inst = nullptr;
        return ERR_RVC_LITE_SUCCESS;
    }

    m_resample_inst = std::make_shared<CFfmpegResampler>();
    return m_resample_inst->init(in_samplerate, out_samplerate, in_channel, out_channel);
}
/**
 * @param num number of input samples
 * @return the output sample count reported by the ffmpeg resampler, or `num`
 *         itself when no resampler is in use
 */
int CResample::get_out_samples(int num)
{
    if (!m_resample_inst)
    {
        return num;
    }
    return m_resample_inst->get_out_samples(num);
}
// Resets the ffmpeg resampler; does nothing when none is in use.
void CResample::reset()
{
    if (!m_resample_inst)
    {
        return;
    }
    m_resample_inst->reset();
}
// Latency reported by the ffmpeg resampler, or 0 when none is in use.
int CResample::get_latency()
{
    if (!m_resample_inst)
    {
        return 0;
    }
    return m_resample_inst->get_latency();
}
int CResample::resample(float *in_buf, int in_num, float *out_buf, int &out_num) {
if (m_resample_inst) {
return m_resample_inst->resample(in_buf, in_num, out_buf, out_num);
}
if (m_in_channel == 2 && m_out_channel == 1) {
if (out_num < in_num) {
return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
}
stereo2mono(in_buf, in_num, out_buf);
return ERR_RVC_LITE_SUCCESS;
}
if (m_in_channel == 1 && m_out_channel == 2) {
if (out_num < in_num) {
return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
}
mono2stereo(in_buf, in_num, out_buf);
return ERR_RVC_LITE_SUCCESS;
}
return ERR_RVC_LITE_SUCCESS;
}
/*******************************对外的类***************************************/
/*******************************对内函数***************************************/
void CRvcLiteOnlineRealTime::init_variable() {
m_init = false;
m_rvc_stop = true;
m_sample_rate = 44100;
m_channel = 1;
m_synth_path = "";
m_new_synth_path = "";
+ m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
}
/*******************************对外函数***************************************/
// Constructor: only puts the members into their default state; the real
// set-up happens in init().
CRvcLiteOnlineRealTime::CRvcLiteOnlineRealTime() {
init_variable();
}
// Destructor: tears the instance down through uninit().
CRvcLiteOnlineRealTime::~CRvcLiteOnlineRealTime() {
uninit();
}
-int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, const char *synth_model_path, int sample_rate,
- int channel) {
+int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, int sample_rate, int channel) {
if (m_init) {
return ERR_RVC_LITE_RT_REINIT;
}
if (sample_rate < 16000) {
return ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR;
}
init_variable();
m_sample_rate = sample_rate;
m_channel = channel;
- m_synth_path = synth_model_path;
- m_new_synth_path = synth_model_path;
+ m_synth_path = "";
+ m_new_synth_path = "";
+ m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
int output_one_sec_number = m_sample_rate * m_channel; // 临时使用的数据
+ int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
CThreadPool::Task task = std::bind(&CRvcLiteOnlineRealTime::rvc_process, this);
m_rvc_inst = std::make_shared<CRvcLiteOnline>();
- int err = m_rvc_inst->init(hubert_model_path, synth_model_path);
+ int err = m_rvc_inst->init(hubert_model_path);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
-
// 重采样部分
m_resample_queue = std::make_shared<CRvcCircleBuffer>(sample_rate * 3 * m_channel);
m_resample16 = std::make_shared<CResample>();
err = m_resample16->init(m_sample_rate, gs_src_samplerate, m_channel, 1);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
m_resample2src = std::make_shared<CResample>();
err = m_resample2src->init(gs_dst_samplerate, m_sample_rate, 1, m_channel);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
m_resample_buf_max_len = 2048; // 此时空间最大是2048,保证不超即可
m_resample_in_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
m_resample_out_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
// 核心处理部分
m_input_tmp_buf_len = gs_src_samplerate;
m_output_tmp_buf_len = gs_dst_samplerate;
m_input_tmp_buf = std::shared_ptr<float>(new float[m_input_tmp_buf_len], std::default_delete<float[]>());
m_output_tmp_buf = std::shared_ptr<float>(new float[m_output_tmp_buf_len], std::default_delete<float[]>());
memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
// 循环buffer
m_input_queue = std::make_shared<CRvcCircleBuffer>(m_input_tmp_buf_len * 3);
// 对外的是目标的采样率和通道数的数据
m_out_queue = std::make_shared<CRvcCircleBuffer>(output_one_sec_number * 3);
-
+ m_latency_queue = std::make_shared<CRvcCircleBuffer>(latency_len);
// 提前塞入两组,保证延迟稳定在2s
for (int i = 0; i < 2; i++) {
// 塞入1s数据
for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
}
+ // 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟
+ for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
+ m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
+ }
+ m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
// 开始处理线程
m_thread_pool = std::make_shared<CThreadPool>();
m_thread_pool->start(1);
m_rvc_stop = false;
m_thread_pool->run(task);
m_init = true;
exit:
if (ERR_RVC_LITE_SUCCESS != err) {
m_init = true;
uninit();
}
return err;
}
/**
 * Requests a switch to another synthesizer model.
 *
 * Only records the requested path (under m_rvc_mutex); rvc_process() compares
 * it with the active path and performs the actual switch.
 *
 * @param synth_model_path path of the model to switch to. An empty string
 *        requests the no-effect path; a null pointer is treated like an
 *        empty string.
 * @return ERR_RVC_LITE_RT_NOT_INIT when the instance is not initialised,
 *         ERR_RVC_LITE_SUCCESS otherwise.
 */
int CRvcLiteOnlineRealTime::switch_synth(const char *synth_model_path) {
    if (!m_init) {
        return ERR_RVC_LITE_RT_NOT_INIT;
    }
    // Assigning a null pointer to the path string is undefined behaviour, so map it to "".
    const char *requested_path = (synth_model_path != nullptr) ? synth_model_path : "";
    {
        std::unique_lock<std::mutex> lock(m_rvc_mutex);
        m_new_synth_path = requested_path;
    }
    return ERR_RVC_LITE_SUCCESS;
}
/**
 * Queues one block of input and fetches one block of output.
 *
 * @param in_buf  input samples, pushed to m_resample_queue.
 * @param in_len  number of floats in in_buf.
 * @param out_buf destination; zero-filled first, then filled from m_out_queue.
 * @param out_len number of floats requested.
 * @return ERR_RVC_LITE_RT_NOT_INIT when not initialised,
 *         ERR_RVC_LITE_RT_NOT_ENOUGH_DATA when the length reported back by the
 *         output queue differs from out_len, ERR_RVC_LITE_SUCCESS otherwise.
 */
int CRvcLiteOnlineRealTime::process(float *in_buf, int in_len, float *out_buf, int out_len) {
    if (!m_init) {
        return ERR_RVC_LITE_RT_NOT_INIT;
    }

    // Hand the new input over and wake the processing loop.
    {
        std::unique_lock<std::mutex> guard(m_rvc_mutex);
        m_resample_queue->push(in_buf, in_len);
        m_rvc_cond.notify_all();
    }

    // Start from silence so anything the queue cannot supply stays zero.
    memset(out_buf, 0, sizeof(float) * out_len);

    // pop() is expected to write the delivered count back into `fetched` (TODO confirm).
    int fetched = out_len;
    {
        std::unique_lock<std::mutex> guard(m_rvc_mutex);
        m_out_queue->pop(out_buf, fetched);
    }

    if (fetched == out_len) {
        return ERR_RVC_LITE_SUCCESS;
    }
    return ERR_RVC_LITE_RT_NOT_ENOUGH_DATA;
}
// Resets every internal queue and processing stage, then refills the output and latency queues with silence.
// Returns immediately when the instance has not been initialised.
void CRvcLiteOnlineRealTime::reset() {
if (!m_init) {
return;
}
{
// Everything below runs while holding m_rvc_mutex.
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_resample_queue->reset();
m_resample16->reset();
m_resample2src->reset();
m_input_queue->reset();
m_out_queue->reset();
m_rvc_inst->reset();
+ m_latency_queue->reset();
+ // Pre-fill two batches so that the latency stays fixed at 2 s
+ int output_one_sec_number = m_sample_rate * m_channel; // temporary value: sample rate times channel count, used below as the size of one batch
+ memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
+ m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
+ }
+ m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len); // remainder that does not fill a whole chunk
+ }
+ // The algorithm itself has latency; to keep the latency consistent, the same delay has to be added when no effect is applied
+ int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
+ for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
+ m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
+ }
+ m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len); // remainder that does not fill a whole chunk
}
}
/**
 * Stops the worker and hands back everything still buffered inside the instance.
 *
 * Pending input is pushed through the path selected by m_syn_state (straight
 * through the latency queue, or resample + conversion), then the whole output
 * queue is copied into a freshly allocated array.
 *
 * @param out_buf receives a buffer allocated with new float[len]; the caller
 *        owns it and must release it with delete[]. Set to nullptr when the
 *        instance is not initialised.
 * @param len     receives the number of floats stored in out_buf (0 when the
 *        instance is not initialised).
 */
void CRvcLiteOnlineRealTime::flush(float *&out_buf, int &len) {
    // Same guard as process()/reset()/switch_synth(): without it the queues
    // below are dereferenced before init() has created them.
    if (!m_init) {
        out_buf = nullptr;
        len = 0;
        return;
    }
    // Emit all of the data that is still held internally
    /**
     * Stop first
     */
    // NOTE(review): the queues are used without locking below; this assumes the
    // worker is no longer running once stop() returns — confirm CThreadPool::stop() waits.
    stop();
- // 先将重采样的部分走完
+    // Case without voice conversion
    int resample_in_len = 0;
    int resample_out_len = 0;
-// m_resample_queue->push(m_resample_in_buf.get(), m_resample_buf_max_len);
+    if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
+    {
+        // Move the pending input through the latency queue into the output queue
+        while (m_resample_queue->size() > 0) {
+            resample_in_len = m_resample_buf_max_len;
+            m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
+            m_latency_queue->push(m_resample_in_buf.get(), resample_in_len);
+            m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
+            m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
+        }
+
+        // Then drain whatever is still sitting in the latency queue
+        while(m_latency_queue->size() > 0)
+        {
+            resample_in_len = m_resample_buf_max_len;
+            m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
+            m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
+        }
+
+        // Copy all of the data out
+        len = m_out_queue->size();
+        out_buf = new float[len];
+        m_out_queue->pop(out_buf, len);
+        return;
+    }
+
+    // Case with voice conversion
    while (m_resample_queue->size() > 0) {
        resample_in_len = m_resample_buf_max_len;
        m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
        // The input is interleaved, so the channel count has to be taken into account
        resample_out_len = m_resample16->get_out_samples(resample_in_len / m_channel);
        m_resample16->resample(m_resample_in_buf.get(), resample_in_len / m_channel, m_resample_out_buf.get(),
                               resample_out_len);
        // The output is 16 kHz mono, so no channel handling is needed
        m_input_queue->push(m_resample_out_buf.get(), resample_out_len);
    }
    // Pad the input queue with zeros up to a multiple of m_input_tmp_buf_len so
    // that the trailing partial block is processed as well
    memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
    int add_size = m_input_tmp_buf_len - m_input_queue->size() % m_input_tmp_buf_len;
    if (add_size != 0 && add_size < m_input_tmp_buf_len) {
        m_input_queue->push(m_input_tmp_buf.get(), add_size);
    }
    int num = m_input_queue->size() / m_input_tmp_buf_len;
    for (int i = 0; i < num; i++) {
        rvc_process_step();
    }
    // Copy all of the data out
    len = m_out_queue->size();
    out_buf = new float[len];
    m_out_queue->pop(out_buf, len);
}
/**
 * Reports the latency in milliseconds: the value returned by
 * m_rvc_inst->get_latency_ms() plus a fixed 2000 ms.
 */
int CRvcLiteOnlineRealTime::get_latency_ms() {
    // The output queue is primed with two batches of silence to hold the delay at 2 s.
    const int prefill_latency_ms = 2000;
    const int algorithm_latency_ms = m_rvc_inst->get_latency_ms();
    return algorithm_latency_ms + prefill_latency_ms;
}
/*******************************对内函数***************************************/
/**
 * Shuts the instance down: stops the processing thread and marks the
 * instance as not initialised. Does nothing when it is not initialised.
 */
void CRvcLiteOnlineRealTime::uninit() {
    if (!m_init) {
        return;
    }
    stop();
    // Clear the flag so the public entry points report "not initialised" again.
    // init()'s failure path sets m_init = true only to get past the guard above;
    // without this reset a failed init would leave the object looking usable.
    m_init = false;
}
void CRvcLiteOnlineRealTime::stop() {
// 释放thread_pool的数据,先通知一下rvc_process,防止是在等待中
m_rvc_stop = true;
if (m_thread_pool) {
m_rvc_cond.notify_all();
m_thread_pool->stop();
}
}
/**
 * Converts one block: pops m_input_tmp_buf_len samples from the input queue,
 * runs them through m_rvc_inst->process_block(), resamples the result chunk by
 * chunk with m_resample2src and pushes it to the output queue.
 *
 * Returns without doing anything when the input queue holds less than one block.
 * While m_syn_state is RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT the last resampled
 * chunk is faded out and the state advances to RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT.
 */
void CRvcLiteOnlineRealTime::rvc_process_step() {
    struct timeval start;
    struct timeval end;
    int sample_out_len = 0;
    // Start processing only once a full block is available
    if (m_input_queue->size() < m_input_tmp_buf_len) {
        return;
    }
    gettimeofday(&start, NULL);
    m_input_queue->pop(m_input_tmp_buf.get(), m_input_tmp_buf_len);
    m_rvc_inst->process_block(m_input_tmp_buf.get(), m_input_tmp_buf_len,
                              m_output_tmp_buf.get(), m_output_tmp_buf_len);
    gettimeofday(&end, NULL);
    LOGD("RvcLite", "rvc_process process sp %f ms",
         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
    // Resample
    // The sample rate goes up here (by less than 2x) and the channel count may
    // double, so each step takes a quarter of the resample buffer
    gettimeofday(&start, NULL);
+    bool last = false;
    int step = m_resample_buf_max_len / 4;
    for (int i = 0; i < m_output_tmp_buf_len; i += step) {
-        if (i + step > m_output_tmp_buf_len) {
+        if (i + step >= m_output_tmp_buf_len) {
            step = m_output_tmp_buf_len - i;
+            last = true;
        }
        // The input here is mono, so the sample count equals the total length
        sample_out_len = m_resample2src->get_out_samples(step);
        m_resample2src->resample(m_output_tmp_buf.get() + i, step, m_resample_out_buf.get(), sample_out_len);
+
+        // Effect -> no effect
+        if(last && m_syn_state == RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT)
+        {
+            // The no-effect path is delayed to line up as well, so only a fade-out is needed here.
+            // The ramp is normalised by the interleaved length of the resampled chunk
+            // (not by the mono input length `step`), so the gain runs from 1 towards 0
+            // and never goes negative.
+            const int fade_len = sample_out_len * m_channel;
+            for(int ii = 0; ii < fade_len; ii += m_channel)
+            {
+                float rate = ii * 1.0 / fade_len;
+                for(int jj = 0; jj < m_channel; jj++)
+                {
+                    m_resample_out_buf.get()[ii+jj] = m_resample_out_buf.get()[ii+jj] * (1 - rate);
+                }
+            }
+            m_syn_state = RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT;
+        }
+
        {
            std::unique_lock<std::mutex> lock(m_rvc_mutex);
            m_out_queue->push(m_resample_out_buf.get(), sample_out_len * m_channel);
        }
    }
    gettimeofday(&end, NULL);
    LOGD("RvcLite", "rvc_process re_resample sp %f ms",
         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
    printf("finish ...\n");
}
/**
 * Processing loop: runs until m_rvc_stop is set.
 *
 * Each iteration takes m_resample_buf_max_len samples from m_resample_queue
 * (waiting on m_rvc_cond while fewer are available), applies a pending model
 * switch, and then either passes the block through the latency queue (no
 * effect) or resamples it and runs rvc_process_step() (effect).
 */
void CRvcLiteOnlineRealTime::rvc_process() {
    int sample_in_len;
    int sample_out_len = 0;
    while (!m_rvc_stop) {
        // Copy of the requested model path. It is taken inside the locked section
        // below because switch_synth() writes m_new_synth_path under m_rvc_mutex,
        // possibly from another thread.
        std::string requested_synth_path;
        {
            // Fetch one block of input
            std::unique_lock<std::mutex> lock(m_rvc_mutex);
            if (m_resample_queue->size() < m_resample_buf_max_len) {
                // Check the stop flag before going to sleep
                if (m_rvc_stop) {
                    return;
                }
                m_rvc_cond.wait(lock);
                continue;
            }
            sample_in_len = m_resample_buf_max_len;
            m_resample_queue->pop(m_resample_in_buf.get(), sample_in_len);
            requested_synth_path = m_new_synth_path;
        }
- // 重采样到16k,此处采样率变低,所以不会出现sample_out_len > sample_in_len的情况
- sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel);
- m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(),
- sample_out_len);
- m_input_queue->push(m_resample_out_buf.get(), sample_out_len);
-
- // 开始变声
+        /**
+         * Three cases are handled here:
+         * 1. no effect -> effect: the current block is faded out and sent through
+         *    the no-effect path, then the converter is reset.
+         * 2. effect -> no effect: the tail of the converted output is faded out in
+         *    rvc_process_step(); on the next block the latency queue is refilled
+         *    with silence and the input is faded in.
+         * 3. effect -> effect: nothing to do here, it is handled internally.
+         */
        if (m_synth_path != requested_synth_path) {
+
+            // No effect -> effect: fade out this block
+            if(m_synth_path.empty() && !requested_synth_path.empty())
+            {
+                m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT;
+            }
+
+            // Effect -> no effect
+            if (!m_synth_path.empty() && requested_synth_path.empty())
+            {
+                m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT;
+            }
+
            {
                std::unique_lock<std::mutex> lock(m_rvc_mutex);
                m_synth_path = requested_synth_path;
            }
            m_rvc_inst->switch_synth_model(requested_synth_path.c_str());
        }
+
+        // First block right after switching back to no effect
+        if(m_syn_state == RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT)
+        {
+            // Just switched from effect to no effect: clear the latency data and fade in the input
+            m_latency_queue->reset();
+            // The algorithm itself has latency; to keep the latency consistent, the same delay has to be added when no effect is applied
+            memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
+            int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
+            for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
+                m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
+            }
+            m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
+
+            // Fade in the input
+            for(int i = 0; i < sample_in_len; i+=m_channel)
+            {
+                float rate = i * 1.0 / sample_in_len;
+                for(int j = 0; j < m_channel; j++)
+                {
+                    m_resample_in_buf.get()[i+j] *= rate;
+                }
+            }
+            m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
+        }
+
+        // No effect: delay the block through the latency queue and output it
+        if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
+        {
+            m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
+            m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
+            {
+                std::unique_lock<std::mutex> lock(m_rvc_mutex);
+                m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
+            }
+            continue;
+        }
+
+        // Transition from no effect to effect
+        if (m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT)
+        {
+            // Fade out
+            for(int i = 0; i < sample_in_len; i+=m_channel)
+            {
+                float rate = i * 1.0 / sample_in_len;
+                for(int j = 0; j < m_channel; j++)
+                {
+                    m_resample_in_buf.get()[i+j] *= 1 - rate;
+                }
+            }
+            m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
+            m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
+            {
+                std::unique_lock<std::mutex> lock(m_rvc_mutex);
+                m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
+            }
+
+            // From the converter's point of view the input is now discontinuous, so clear its internal state and start over
+            m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT;
+            m_rvc_inst->reset();
+            continue;
+        }
+
+        // Resample to 16 kHz; the sample rate goes down here, so sample_out_len > sample_in_len cannot happen
+        sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel);
+        m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(),
+                               sample_out_len);
+        m_input_queue->push(m_resample_out_buf.get(), sample_out_len);
        rvc_process_step();
    }
}
\ No newline at end of file
diff --git a/mnn_demo/src/CRvcLiteSynthesizer.cpp b/mnn_demo/src/CRvcLiteSynthesizer.cpp
new file mode 100644
index 0000000..9bce8d7
--- /dev/null
+++ b/mnn_demo/src/CRvcLiteSynthesizer.cpp
@@ -0,0 +1,106 @@
+//
+// Created by Administrator on 2024/1/21.
+//
+
+#include "CRvcLiteSynthesizer.h"
+#include <cstring>
+#include <sys/time.h>
+
+// Does no work of its own; init() sets the instance up.
+CRvcLiteSynthesizer::CRvcLiteSynthesizer() = default;
+
+// No explicit clean-up in the destructor body.
+CRvcLiteSynthesizer::~CRvcLiteSynthesizer() = default;
+
+/**
+ * Creates the converter and both resamplers and clears the scratch-buffer bookkeeping.
+ *
+ * @param hubert_model path passed to CRvcLiteOnline::init().
+ * @param synth_model  path passed to CRvcLiteOnline::switch_synth_model().
+ * @param sample_rate  sample rate of the caller's audio.
+ * @param channel      channel count of the caller's audio.
+ * @return ERR_RVC_LITE_SUCCESS, or the first error reported by the converter
+ *         or by one of the resamplers.
+ */
+int CRvcLiteSynthesizer::init(const char *hubert_model, const char *synth_model, int sample_rate, int channel)
+{
+    m_rvc_inst = std::make_shared<CRvcLiteOnline>();
+    int err = m_rvc_inst->init(hubert_model);
+    if (err != ERR_RVC_LITE_SUCCESS)
+    {
+        return err;
+    }
+    err = m_rvc_inst->switch_synth_model(synth_model);
+    if (err != ERR_RVC_LITE_SUCCESS)
+    {
+        return err;
+    }
+    // Caller format -> gs_src_samplerate mono. The result is checked so that a
+    // resampler that failed to initialise is reported instead of being used later.
+    m_resample2_16 = std::make_shared<CResample>();
+    err = m_resample2_16->init(sample_rate, gs_src_samplerate, channel, 1);
+    if (err != ERR_RVC_LITE_SUCCESS)
+    {
+        return err;
+    }
+    // gs_dst_samplerate mono -> caller format.
+    m_resample2src = std::make_shared<CResample>();
+    err = m_resample2src->init(gs_dst_samplerate, sample_rate, 1, channel);
+    if (err != ERR_RVC_LITE_SUCCESS)
+    {
+        return err;
+    }
+
+    m_channel = channel;
+    m_sample_rate = sample_rate;
+
+    // The scratch buffers are allocated lazily by process().
+    m_buf_tmp_16k_len = 0;
+    m_buf_tmp_16k_cap = 0;
+    m_buf_tmp_32k_len = 0;
+    m_buf_tmp_32k_cap = 0;
+    m_buf_tmp_src_len = 0;
+    m_buf_tmp_src_cap = 0;
+    return ERR_RVC_LITE_SUCCESS;
+}
+
+/**
+ * Converts one block in three steps: resample to the converter's input format,
+ * run the converter, resample back to the caller's format.
+ *
+ * @param in_buf  interleaved input samples.
+ * @param in_len  number of floats in in_buf.
+ * @param out_buf destination for the converted samples.
+ * @param out_len in: capacity of out_buf in floats; out: number of floats
+ *        written (reduced when less data was produced than requested).
+ * @return ERR_RVC_LITE_BLOCK_TOO_LONG when the resampled input would exceed
+ *         gs_src_samplerate samples, the error of the failing stage, or
+ *         ERR_RVC_LITE_SUCCESS.
+ */
+int CRvcLiteSynthesizer::process(float *in_buf, int in_len, float *out_buf, int &out_len) {
+    // Grows a scratch buffer to hold at least `need` floats; never shrinks it.
+    auto ensure_capacity = [](auto &buf, auto &cap, int need) {
+        if (cap < need) {
+            cap = need;
+            buf = std::shared_ptr<float>(new float[cap], std::default_delete<float[]>());
+        }
+    };
+
+    // Step 1: resample the input down to the converter's format
+    const int in_frames = in_len / m_channel;
+    const int frames_16k = m_resample2_16->get_out_samples(in_frames);
+    // Guard: the block must not be longer than this
+    if (frames_16k > gs_src_samplerate) {
+        return ERR_RVC_LITE_BLOCK_TOO_LONG;
+    }
+
+    ensure_capacity(m_buf_tmp_16k, m_buf_tmp_16k_cap, frames_16k);
+    m_buf_tmp_16k_len = frames_16k;
+    int ret = m_resample2_16->resample(in_buf, in_frames, m_buf_tmp_16k.get(), m_buf_tmp_16k_len);
+    if (ret != ERR_RVC_LITE_SUCCESS) {
+        return ret;
+    }
+
+    // Step 2: inference; the output buffer is sized at twice the input length
+    ensure_capacity(m_buf_tmp_32k, m_buf_tmp_32k_cap, m_buf_tmp_16k_len * 2);
+    m_buf_tmp_32k_len = m_buf_tmp_16k_len * 2;
+    ret = m_rvc_inst->process_block(m_buf_tmp_16k.get(), m_buf_tmp_16k_len, m_buf_tmp_32k.get(), m_buf_tmp_32k_len);
+    if (ret != ERR_RVC_LITE_SUCCESS) {
+        return ret;
+    }
+
+    // Step 3: resample back to the caller's format
+    const int frames_out = m_resample2src->get_out_samples(m_buf_tmp_32k_len);
+    ensure_capacity(m_buf_tmp_src, m_buf_tmp_src_cap, frames_out * m_channel);
+    m_buf_tmp_src_len = frames_out;
+    ret = m_resample2src->resample(m_buf_tmp_32k.get(), m_buf_tmp_32k_len, m_buf_tmp_src.get(), m_buf_tmp_src_len);
+    if (ret != ERR_RVC_LITE_SUCCESS) {
+        return ret;
+    }
+
+    // Deliver the smaller of what was requested and what was produced
+    const int produced = m_buf_tmp_src_len * m_channel;
+    if (produced < out_len) {
+        out_len = produced;
+    }
+
+    memcpy(out_buf, m_buf_tmp_src.get(), sizeof(float) * out_len);
+    return ERR_RVC_LITE_SUCCESS;
+}
+
+/**
+ * Measures how long process() takes for a block of silence just under one
+ * second long.
+ *
+ * @return elapsed wall-clock time in seconds. Because the block is roughly one
+ *         second of audio, this approximates the real-time factor.
+ */
+float CRvcLiteSynthesizer::get_rtf()
+{
+    // 100 frames short of one second — presumably to stay below the
+    // block-length limit enforced by process() (TODO confirm).
+    const int in_len = m_sample_rate * m_channel - 100 * m_channel;
+    // Separate in/out length so process() can adjust it without touching in_len.
+    int out_len = in_len;
+    // Zero-initialised so that process() never reads indeterminate values;
+    // owned by a smart pointer so it is released on every exit path.
+    std::shared_ptr<float> buf(new float[in_len](), std::default_delete<float[]>());
+
+    // Time only the processing call, not the allocation.
+    struct timeval start;
+    struct timeval end;
+    gettimeofday(&start, NULL);
+    // In-place: the same buffer is used as input and output.
+    process(buf.get(), in_len, buf.get(), out_len);
+    gettimeofday(&end, NULL);
+    double sp = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0;
+    return sp / 1000;
+}
\ No newline at end of file

File Metadata

Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:32 (1 d, 11 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347189
Default Alt Text
(56 KB)

Event Timeline