Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F4880320
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
63 KB
Subscribers
None
View Options
diff --git a/.arcconfig b/.arcconfig
new file mode 100644
index 0000000..99e9d55
--- /dev/null
+++ b/.arcconfig
@@ -0,0 +1,4 @@
+{
+ "phabricator.uri" : "http://phabricator.ushow.media/",
+ "editor": "vim"
+}
diff --git a/mnn_demo/inc/CRvcLiteSynthesizer.h b/mnn_demo/inc/CRvcLiteSynthesizer.h
index cedcb2e..c25d0df 100644
--- a/mnn_demo/inc/CRvcLiteSynthesizer.h
+++ b/mnn_demo/inc/CRvcLiteSynthesizer.h
@@ -1,82 +1,82 @@
//
// Created by Administrator on 2024/1/21.
//
#ifndef MNN_DEMO_CRVCLITESYNTHESIZER_H
#define MNN_DEMO_CRVCLITESYNTHESIZER_H
#include "CRvcLiteOnline.h"
/**
 * Offline-style voice-conversion front end declared on top of CRvcLiteOnline.
 * It holds one CRvcLiteOnline instance, two CResample instances and three
 * temporary float buffers (16k / 32k / source format, judging by the member names).
 */
class CRvcLiteSynthesizer
{
public:
CRvcLiteSynthesizer();
~CRvcLiteSynthesizer();
public:
/**
* Initialise the instance.
* @param hubert_model path of the semantic (hubert) model
* @param sample_rate sample rate
* @param channel number of channels
* @return 0 on success
*/
int init(const char* hubert_model, int sample_rate, int channel);
/**
* Select the voice (timbre) model.
* @param synth_model path of the timbre model
* @return status code (not specified by the original comment)
*/
int switch_model(const char* synth_model);
/**
* Set the pitch shift; the range is [-12, 12].
* It only takes effect while a voice model is selected.
* The value survives a model switch, and a value set while no voice model is
* selected is applied once a voice model is selected.
* @param key pitch shift
*/
void set_up_key(int key);
/**
* reset: clear the internal data.
*/
void reset();
/**
- * Processing logic
+ * Processing logic: do not feed too much per call; about 900 ms is recommended
* @param in_buf input buffer
* @param in_len length of the input buffer, frame*channel; at most 1 s of audio is recommended, as large as possible within that
* @param out_buf output buffer
* @param out_len length of the output buffer, frame*channel
* Note: the produced length is not necessarily equal to in_len. It is less than or equal to out_len,
* but the output is continuous, so out_len may be somewhat larger than in_len to make sure everything can be returned.
* @return status code (not specified by the original comment)
*/
int process(float* in_buf, int in_len, float* out_buf, int &out_len);
/**
* Get the real-time factor: actual time spent processing 1 s of data / 1 s.
* @return the real-time factor
*/
float get_rtf();
private:
std::shared_ptr<CRvcLiteOnline> m_rvc_inst;
std::shared_ptr<CResample> m_resample2_16;
std::shared_ptr<CResample> m_resample2src;
int m_channel;
int m_sample_rate;
// Each temporary buffer below comes with a length and a capacity counter.
std::shared_ptr<float> m_buf_tmp_16k;
int m_buf_tmp_16k_len;
int m_buf_tmp_16k_cap;
std::shared_ptr<float> m_buf_tmp_32k;
int m_buf_tmp_32k_len;
int m_buf_tmp_32k_cap;
std::shared_ptr<float> m_buf_tmp_src;
int m_buf_tmp_src_len;
int m_buf_tmp_src_cap;
bool m_first;
};
#endif //MNN_DEMO_CRVCLITESYNTHESIZER_H
diff --git a/mnn_demo/main.cpp b/mnn_demo/main.cpp
index 0d6b685..d742793 100644
--- a/mnn_demo/main.cpp
+++ b/mnn_demo/main.cpp
@@ -1,285 +1,286 @@
#include <sys/time.h>
#include <thread>
#include <chrono>
#include "src/Hubert.h"
#include "src/CSynthesizer.h"
#include "CRvcLiteSynthesizer.h"
#include "CRvcLiteOnlineV2.h"
// Smoke test for Hubert: loads the model and runs one inference on a constant
// 33280-sample input into a pre-sized 1 x 205 x 256 result.
int test_hubert() {
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v1_fp16.mnn";

    Hubert hubert;
    const int err_code = hubert.init(hubert_model_path);
    (void) err_code;  // the status is not inspected by this smoke test

    std::vector<float> input(33280, 0.1);
    std::vector<std::vector<std::vector<float>>> ret(
            1, std::vector<std::vector<float>>(205, std::vector<float>(256)));

    const float time = hubert.process(input.data(), ret);
    (void) time;  // the returned value is not reported
    return 0;
}
// Smoke test for CSynthesizer: feeds a constant 1 x 205 x 258 input ten times
// and prints the accumulated value returned by process().
int test_contentvec() {
    const char *contentvec_model_path = "/mnt/d/dataset/svc/models/mnn/contentvec_test_fp16.mnn";

    CSynthesizer contentVec;
    const int err_code = contentVec.init(contentvec_model_path);
    (void) err_code;  // the status is not inspected by this smoke test

    // Every row is 256 x 0.1 followed by 0.2 and 1.0 in the last two slots.
    std::vector<float> row(258, 0.1);
    row[256] = 0.2;
    row[257] = 1.0;
    std::vector<std::vector<std::vector<float>>> input(1);
    input[0].assign(205, row);

    // Output is pre-sized to 1 x 1 x 35840.
    std::vector<std::vector<std::vector<float>>> ret(
            1, std::vector<std::vector<float>>(1, std::vector<float>(35840)));

    float tot = 0.f;
    int round = 0;
    while (round < 10) {
        tot += contentVec.process(input, ret);
        ++round;
    }
    // NOTE(review): ten iterations are summed but the total is divided by 100 - confirm
    // whether this is a unit conversion or should be the iteration count.
    printf("time: %f \n", tot / 100.f);
    return 0;
}
#include "CRvcLiteOnline.h"
#include "av_waves/waves/inc/STWaveFile.h"
// Drives CRvcLiteOnline::process_block over a whole wav file in one-second
// steps and writes the result as a 32 kHz mono float wav.
//
// Buffers are std::vector so they are zero-initialised and released on every
// path (the previous raw new[] buffers were never freed and the output buffer
// was written to disk uninitialised when process_block failed).
void test() {
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/mnn/hubert_test_v2_fp16.mnn";
    const char *in_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_16.wav";
    const char *out_wav = "/mnt/d/dataset/svc/dataset/tests/rainy_day321_01_cpp_v1.wav";

    CRvcLiteOnline rvc_inst;
    rvc_inst.init(hubert_model_path);

    // Read the audio file; the original comment requires 16k mono input.
    STCWaveFile wav_inst(in_wav, false);
    int sample_rate = wav_inst.GetSampleRate();
    int channel = wav_inst.GetChannels();
    int len = wav_inst.GetTotalFrames() * channel;
    if (len <= 0 || sample_rate <= 0) {
        // A zero step would never advance the loop below.
        printf("invalid input: len=%d sample_rate=%d\n", len, sample_rate);
        return;
    }

    std::vector<float> data(len);
    std::vector<float> outdata(static_cast<size_t>(len) * 2);
    wav_inst.ReadFrameAsfloat(data.data(), wav_inst.GetTotalFrames());

    int step = sample_rate;
    printf("start ..\n");
    for (int i = 0; i < len; i += step) {
        if (i + step > len) {
            step = len - i;
        }
        struct timeval start;
        struct timeval end;
        gettimeofday(&start, NULL);
        // Output is written at twice the input offset/length (see the 2 * factors).
        int ret = rvc_inst.process_block(data.data() + i, step, outdata.data() + 2 * i, 2 * step);
        gettimeofday(&end, NULL);
        if (ret != 0) {
            // process_block returns 0 on success; surface anything else instead of hiding it.
            printf("process_block err=%d!\n", ret);
        }
        printf("sp = %f ms\n", (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
    }

    STCWaveFile wav_out_inst(out_wav, true);
    wav_out_inst.SetSampleRate(32000);
    wav_out_inst.SetChannels(1);
    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
    wav_out_inst.SetupDone();
    wav_out_inst.WriteFrame(outdata.data(), len * 2);
    printf("finish2 ....\n");
}
// Drives CRvcLiteSynthesizer::process over a whole wav file in chunks slightly
// shorter than one second and writes the result in the source format.
//
// Buffers are std::vector so they are zero-initialised and released on every
// path; the previous raw new[] buffers leaked when process() failed and the
// function returned early.
void test_rvc_lite_synth()
{
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
    const char *out_wav = "/mnt/d/dataset/tmp/i_out3.wav";
    const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";

    STCWaveFile wav_inst(in_wav, false);
    int sample_rate = wav_inst.GetSampleRate();
    int channel = wav_inst.GetChannels();
    int len = wav_inst.GetTotalFrames() * channel;
    // One second of interleaved samples minus 100 frames.
    int step = sample_rate * channel - 100 * channel;
    if (len <= 0 || step <= 0) {
        // A non-positive step would never advance the loop below.
        printf("invalid input: len=%d step=%d\n", len, step);
        return;
    }

    std::vector<float> data(len);
    std::vector<float> outdata(len);
    wav_inst.ReadFrameAsfloat(data.data(), wav_inst.GetTotalFrames());

    CRvcLiteSynthesizer m_rvc_inst;
    int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel);
    printf("init err=%d!\n", err);
    printf("rtf=%f\n", m_rvc_inst.get_rtf());

    int out_len = 0;
    for (int i = 0; i < len; i += step)
    {
        if (i + step > len) {
            step = len - i;
        }
        // out_step is passed by reference; it is added to out_len after the call.
        int out_step = step;
        err = m_rvc_inst.process(data.data() + i, step, outdata.data() + out_len, out_step);
        if (err != ERR_RVC_LITE_SUCCESS)
        {
            printf("process err=%d!\n", err);
            return;
        }
        out_len += out_step;
    }

    STCWaveFile wav_out_inst(out_wav, true);
    wav_out_inst.SetSampleRate(sample_rate);
    wav_out_inst.SetChannels(channel);
    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
    wav_out_inst.SetupDone();
    // Any samples not produced by process() stay zero (silence).
    wav_out_inst.WriteFrame(outdata.data(), wav_inst.GetTotalFrames());
}
// Drives CRvcLiteOnlineV2 with a push/pop loop over a whole wav file, switching
// to the syz model after one third of the input and calling reset() after half.
// NOTE(review): `data` and `outdata` are not freed on the early `return` below.
void test_rvc_lite_v2()
{
const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
const char *out_wav = "/mnt/d/dataset/tmp/i_out_01_r.wav";
- const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
+ const char *in_wav = "/mnt/d/dataset/tmp/t1_48.wav";
STCWaveFile wav_inst(in_wav, false);
int sample_rate = wav_inst.GetSampleRate();
int channel = wav_inst.GetChannels();
// Total number of interleaved samples.
int len = wav_inst.GetTotalFrames() * channel;
float *data = new float[len];
float *outdata = new float[len];
wav_inst.ReadFrameAsfloat(data, wav_inst.GetTotalFrames());
CRvcLiteOnlineV2 m_rvc_inst;
int err = m_rvc_inst.init(hubert_model_path, sample_rate, channel);
// m_rvc_inst.switch_model(syz_model);
// m_rvc_inst.set_up_key(0);
printf("init err=%d!\n", err);
// One second of interleaved samples minus 100 frames.
int step = sample_rate * channel - 100 * channel;
int out_len = 0;
bool last = false;
// 0: nothing done yet, 1: model switched, 2: reset done.
int flag = 0;
for(int i = 0; i < len; i+=step)
{
// Shrink the final chunk and mark it as the last one.
if (i + step > len) {
step = len - i;
last = true;
}
int out_step = step;
err = m_rvc_inst.push(data+i, step, last);
if(err != ERR_RVC_LITE_SUCCESS)
{
printf("process err=%d!\n", err);
return ;
}
if (i >= len / 3 && flag == 0)
{
flag = 1;
m_rvc_inst.switch_model(syz_model);
}
+
if (i >= len / 2 && flag == 1)
{
flag = 2;
m_rvc_inst.reset();
}
// NOTE(review): up to 2 * step samples are requested while `outdata` only holds `len`
// floats in total - presumably pop() reports the produced count through out_step; confirm
// that out_len + out_step can never exceed len.
out_step = 2 * step;
m_rvc_inst.pop(outdata+out_len, out_step);
out_len += out_step;
}
STCWaveFile wav_out_inst(out_wav, true);
wav_out_inst.SetSampleRate(sample_rate);
wav_out_inst.SetChannels(channel);
wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
wav_out_inst.SetupDone();
wav_out_inst.WriteFrame(outdata, wav_inst.GetTotalFrames());
delete[] data;
delete[] outdata;
}
// Drives CRvcLiteOnlineRealTime with 1024-sample calls over a whole wav file,
// calls reset() once after one third of the input, then writes the processed
// audio followed by whatever flush() returns.
//
// The input/output buffers are std::vector and the flush() buffer is released
// with delete[]; previously all three allocations leaked.
void test_rvc_lite_online() {
    const char *hubert_model_path = "/mnt/d/dataset/svc/models/layers_3/layer3_contentvec.mnn";
    const char *syz_model = "/mnt/d/dataset/svc/models/layers_3/layer3_syz.mnn";
    const char *in_wav = "/mnt/d/dataset/tmp/t1.wav";
    const char *out_wav = "/mnt/d/dataset/tmp/i_out2.wav";

    STCWaveFile wav_inst(in_wav, false);
    int sample_rate = wav_inst.GetSampleRate();
    int channel = wav_inst.GetChannels();
    int len = wav_inst.GetTotalFrames() * channel;
    if (len <= 0 || channel <= 0) {
        // channel is used as a divisor when writing the flushed data.
        printf("invalid input: len=%d channel=%d\n", len, channel);
        return;
    }

    std::vector<float> data(len);
    std::vector<float> outdata(len);

    CRvcLiteOnlineRealTime rvc_inst;
    rvc_inst.init(hubert_model_path, sample_rate, channel);
    wav_inst.ReadFrameAsfloat(data.data(), wav_inst.GetTotalFrames());

    int step = 1024;
    printf("start ..\n");
    bool flag = true;
    rvc_inst.switch_synth(syz_model);
    for (int i = 0; i < len; i += step) {
        if (i + step > len) {
            step = len - i;
        }
        struct timeval start;
        struct timeval end;
        gettimeofday(&start, NULL);
        int ret = rvc_inst.process(data.data() + i, step, outdata.data() + i, step);
        // Pace the calls; the sleep is included in the printed duration.
        std::this_thread::sleep_for(std::chrono::milliseconds (15));
        gettimeofday(&end, NULL);
        printf("ret = %d, sp = %f ms step=%d\n", ret,
               (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0, step);
        if (flag && i >= len / 3) {
            flag = false;
            rvc_inst.reset();
        }
    }

    STCWaveFile wav_out_inst(out_wav, true);
    wav_out_inst.SetSampleRate(sample_rate);
    wav_out_inst.SetChannels(channel);
    wav_out_inst.SetSampleFormat(SF_IEEE_FLOAT);
    wav_out_inst.SetupDone();
    wav_out_inst.WriteFrame(outdata.data(), wav_inst.GetTotalFrames());

    // flush() allocates the returned buffer with new[]; the caller owns it.
    float *flush_data = nullptr;
    int flush_len = 0;
    rvc_inst.flush(flush_data, flush_len);
    if (flush_data != nullptr) {
        wav_out_inst.WriteFrame(flush_data, flush_len / channel);
        delete[] flush_data;
    }
    printf("finish2 ....\n");
}
// Entry point of the demo: runs the CRvcLiteOnlineV2 scenario only.
// The other scenarios are kept here, disabled, for manual switching:
//   test_hubert(); test_contentvec(); test();
//   test_rvc_lite_online(); test_rvc_lite_synth();
int main() {
    test_rvc_lite_v2();
    return 0;
}
diff --git a/mnn_demo/src/CRvcLiteOnline.cpp b/mnn_demo/src/CRvcLiteOnline.cpp
index f9067f7..60b9fa6 100644
--- a/mnn_demo/src/CRvcLiteOnline.cpp
+++ b/mnn_demo/src/CRvcLiteOnline.cpp
@@ -1,831 +1,831 @@
//
// Created by Administrator on 2023/11/29.
//
#include <cmath>
#include <cstring>
#include <sys/time.h>
#include "CRvcLiteOnline.h"
#include "Hubert.h"
#include "CSynthesizer.h"
#include "espyin-v1.0/ESPYIN.h"
#include "ThreadPool.h"
#include "CRvcCircleBuffer.h"
#include "FfmpegResampler.h"
#include <unistd.h>
// Reports whether access(2) with F_OK succeeds for the given path.
inline bool file_exists (const std::string& name) {
    const int rc = access(name.c_str(), F_OK);
    return rc != -1;
}
// Down-mixes interleaved stereo to mono by averaging each pair of samples.
// `size` is the length of `input` in floats; a trailing unpaired sample is ignored.
// `output` receives size / 2 floats.
void stereo2mono(float *input, int size, float *output) {
    const int pairs = size / 2;
    for (int frame = 0; frame < pairs; ++frame) {
        const float first = input[2 * frame];
        const float second = input[2 * frame + 1];
        output[frame] = (first + second) / 2;
    }
}
// Expands mono to interleaved stereo by writing every input sample twice.
// `size` is the number of input floats; `output` receives 2 * size floats.
void mono2stereo(float *input, int size, float *output) {
    for (int n = 0; n < size; ++n) {
        const float sample = input[n];
        float *pair = output + 2 * n;
        pair[0] = sample;
        pair[1] = sample;
    }
}
// Sizes and zero-fills the input prediction buffer, the output cache and the
// crossfade buffer from the gs_* constants, and pre-sizes the model I/O vectors.
CRvcLiteOnline::CRvcLiteOnline() {
    init_variable();
    m_init = false;
    m_switch_model = false;

    // ---- input side (source sample rate) ----
    // Number of samples in one required input block.
    m_input_block_frame = int(gs_block_time * gs_src_samplerate);
    // Extra samples needed for inference.
    m_input_extra_frame = int(gs_extra_time * gs_src_samplerate);
    const int src_10ms = gs_src_samplerate / 100;  // samples in 10 ms
    const int src_crossfade = int(gs_crossfade_time * gs_src_samplerate);
    // Prediction buffer length, rounded up to a multiple of 10 ms.
    m_input_predict_buf_frame = int(ceil((m_input_extra_frame + src_crossfade + m_input_block_frame)
                                         * 1.0 / src_10ms) * src_10ms);
    // Value-initialised, i.e. zero-filled.
    m_input_predict_buf = new float[m_input_predict_buf_frame]();

    // ---- output side (destination sample rate) ----
    m_crossfade_frame = int(gs_crossfade_time * gs_dst_samplerate);
    m_output_block_frame = int(gs_block_time * gs_dst_samplerate);
    const int dst_extra = int(gs_extra_time * gs_dst_samplerate);
    const int dst_10ms = gs_dst_samplerate / 100;
    m_output_cache_buf_frame = int(ceil((m_output_block_frame + m_crossfade_frame + dst_extra)
                                        * 1.0 / dst_10ms) * dst_10ms);
    m_output_cache_buf = new float[m_output_cache_buf_frame]();
    m_crossfade_buf = new float[m_crossfade_frame]();

    // ---- cached model inputs/outputs; the shapes are fixed by the models ----
    m_hubert_ret.resize(1);
    m_hubert_ret[0].resize(gs_hubert_frame);
    for (auto &frame : m_hubert_ret[0]) {
        frame.resize(gs_hubert_dim);
    }
    // Input of the synth model.
    m_synth_input.resize(1);
    m_synth_input[0].resize(gs_synth_input_frame);
    for (auto &frame : m_synth_input[0]) {
        frame.resize(gs_synth_input_dim);
    }
    m_synth_out.resize(1);
    m_synth_out[0].resize(1);
    m_synth_out[0][0].resize(gs_synth_output_frame);
}
// Releases the buffers allocated by the constructor.
CRvcLiteOnline::~CRvcLiteOnline() {
    this->uninit();
}
/********************************** internal functions *********************************************/
// Frees the three heap buffers and puts every member back to its default value.
void CRvcLiteOnline::uninit() {
    // delete[] on a null pointer is a no-op, so no explicit checks are needed.
    delete[] m_input_predict_buf;
    m_input_predict_buf = NULL;

    delete[] m_output_cache_buf;
    m_output_cache_buf = NULL;

    delete[] m_crossfade_buf;
    m_crossfade_buf = NULL;

    init_variable();
}
// Feeds the prediction buffer to the pYIN tracker in hops of 160 samples, stores the
// pitch values (scaled by m_f0_up_key, negatives clamped to 0) in m_f0_data, resets the
// tracker and then derives m_f0_coarse_data via get_f0_post().
void CRvcLiteOnline::get_pyin_f0() {
// The diff below shortens the loop by 1024 + 160 samples - presumably so that
// process() never reads past the end of the buffer (1024 and 160 match the values
// passed to the ESPYIN constructor in init()); confirm against ESPYIN::process.
- for (int i = 0; i < m_input_predict_buf_frame; i += 160) {
+ for (int i = 0; i < m_input_predict_buf_frame - 1024 - 160; i += 160) {
m_es_pyin->process(m_input_predict_buf + i);
}
m_f0_data.clear();
ESFeatureSet feats = m_es_pyin->getRemainingFeatures();
if (!feats.empty()) {
// NOTE(review): feature index 4 is assumed to exist and to hold the f0 track - confirm.
m_f0_data.resize(feats[4].size());
for (size_t i = 0; i < feats[4].size(); ++i) {
// Apply the pitch shift.
m_f0_data[i] = feats[4][i].values[0] * m_f0_up_key;
if (m_f0_data[i] < 0) {
m_f0_data[i] = 0;
}
}
}
m_es_pyin->reset();
get_f0_post();
}
// Quantises every value of m_f0_data into m_f0_coarse_data: the value is mapped
// through 1127 * log2(1 + f / 700), rescaled so that the 50..1100 range spans
// 1..255, clamped to [1, 255] and rounded to the nearest integer.
void CRvcLiteOnline::get_f0_post() {
    const int lowest_hz = 50;
    const int highest_hz = 1100;
    const float mel_floor = 1127 * log2(1 + lowest_hz * 1.0 / 700);
    const float mel_ceil = 1127 * log2(1 + highest_hz * 1.0 / 700);

    m_f0_coarse_data.clear();
    m_f0_coarse_data.resize(m_f0_data.size());

    for (size_t idx = 0; idx < m_f0_data.size(); ++idx) {
        float mel = 1127 * log2(1 + m_f0_data[idx] / 700);
        // Only positive values are rescaled; zero stays below the clamp threshold.
        if (mel > 0) {
            mel = (mel - mel_floor) * 254.f / (mel_ceil - mel_floor) + 1;
        }
        mel = (mel <= 1) ? 1 : ((mel > 255) ? 255 : mel);
        m_f0_coarse_data[idx] = float(int(mel + 0.5));
    }
}
// Puts every member into its default state. Buffer pointers are only nulled here,
// not freed; freeing is done by uninit() before it calls this function.
void CRvcLiteOnline::init_variable() {
    // State flags and pitch-shift factors.
    m_init = false;
    m_switch_model = false;
    m_fade_in = true;
    m_f0_up_key = 1.f;
    m_f0_new_up_key = 1.f;

    // Input-side sizes and buffer.
    m_input_block_frame = 0;
    m_input_extra_frame = 0;
    m_input_predict_buf_frame = 0;
    m_input_predict_buf = nullptr;

    // Output-side sizes and buffers.
    m_crossfade_frame = 0;
    m_output_block_frame = 0;
    m_output_cache_buf_frame = 0;
    m_crossfade_buf = nullptr;
    m_output_cache_buf = nullptr;

    // Pitch data and cached results of the model instances.
    m_f0_data.clear();
    m_f0_coarse_data.clear();
    m_hubert_ret.clear();
    m_synth_input.clear();
    m_synth_out.clear();
}
/********************************** public functions *********************************************/
// Creates the hubert model, an empty synthesizer and the pYIN tracker.
// Returns ERR_RVC_LITE_REINIT when called on an initialised instance,
// ERR_RVC_LITE_SUCCESS otherwise.
int CRvcLiteOnline::init(const char *hubert_model_path) {
    if (m_init) {
        return ERR_RVC_LITE_REINIT;
    }

    m_hubert_inst = std::make_shared<Hubert>();
    m_synthesizer_inst = std::make_shared<CSynthesizer>();
    // NOTE(review): the status returned by Hubert::init is not checked - confirm whether
    // a failed model load should be reported to the caller.
    m_hubert_inst->init(hubert_model_path);
    // The synth model is loaded later through switch_synth_model().

    // The original comment requires stepSize to be a power of two.
    m_es_pyin = std::make_shared<ESPYIN>(16000, 160, 1024, 50, 1100);

    m_f0_new_up_key = 1.f;
    m_f0_up_key = 1.f;
    m_fade_in = true;
    m_switch_model = false;
    m_init = true;
    return ERR_RVC_LITE_SUCCESS;
}
// Loads a new synth (timbre) model.
// @param synth_model_path path of the model file; may be null
// @return ERR_RVC_LITE_NOT_INIT before init(), ERR_RVC_LITE_MODEL_NOT_EXISTS when the
//         path is null or the file is not accessible, ERR_RVC_LITE_SUCCESS otherwise.
int CRvcLiteOnline::switch_synth_model(const char *synth_model_path) {
    if (!m_init) {
        return ERR_RVC_LITE_NOT_INIT;
    }
    // A null pointer must not reach file_exists(): it would construct a std::string
    // from nullptr, which is undefined behaviour.
    if (synth_model_path == nullptr || !file_exists(synth_model_path)) {
        return ERR_RVC_LITE_MODEL_NOT_EXISTS;
    }
    m_synthesizer_inst = std::make_shared<CSynthesizer>();
    // NOTE(review): the status returned by CSynthesizer::init is not checked - confirm.
    m_synthesizer_inst->init(synth_model_path);
    m_switch_model = true;
    return ERR_RVC_LITE_SUCCESS;
}
// Stores the pitch-shift factor 2^(key / 12) for the next processed block.
// `key` is clamped to [-12, 12] first.
void CRvcLiteOnline::set_up_key(int key)
{
    const int clamped = (key > 12) ? 12 : ((key < -12) ? -12 : key);
    m_f0_new_up_key = pow(2, clamped / 12.f);
}
// Zeroes the three audio buffers and requests a fade-in on the next block.
void CRvcLiteOnline::reset() {
    m_fade_in = true;

    const size_t input_bytes = sizeof(float) * m_input_predict_buf_frame;
    const size_t crossfade_bytes = sizeof(float) * m_crossfade_frame;
    const size_t cache_bytes = sizeof(float) * m_output_cache_buf_frame;

    memset(m_input_predict_buf, 0, input_bytes);
    memset(m_crossfade_buf, 0, crossfade_bytes);
    memset(m_output_cache_buf, 0, cache_bytes);
}
// Milliseconds elapsed between two gettimeofday() samples.
static double rvc_lite_elapsed_ms(const struct timeval &from, const struct timeval &to) {
    return (to.tv_sec - from.tv_sec) * 1000.0 + (to.tv_usec - from.tv_usec) / 1000.0;
}

// Converts one block of audio.
//
// The new input is appended to the sliding prediction buffer, pitch and hubert
// features are extracted from the whole buffer, the synth model is run, and the
// last out_len samples before the crossfade tail of the output cache are returned
// after being crossfaded with the tail saved from the previous call.
//
// @param in_buf  input samples; modified in place when a fade-in is pending
// @param in_len  number of input samples (must not exceed the prediction buffer length)
// @param out_buf receives out_len samples
// @param out_len number of samples to return
// @return ERR_RVC_LITE_NOT_INIT / ERR_RVC_LITE_NOT_SWITCH_MODEL when not ready, 0 on success
int CRvcLiteOnline::process_block(float *in_buf, int in_len, float *out_buf, int out_len) {
    if (!m_init) {
        return ERR_RVC_LITE_NOT_INIT;
    }
    if (!m_switch_model) {
        return ERR_RVC_LITE_NOT_SWITCH_MODEL;
    }
    // The external data became discontinuous (e.g. after reset()), so fade the input in.
    if (m_fade_in) {
        for (int i = 0; i < in_len; i++) {
            float rate = i * 1.0 / in_len;
            in_buf[i] = in_buf[i] * rate;
        }
        m_fade_in = false;
    }
    // Drop the oldest in_len samples. Source and destination overlap, so this has to be
    // memmove; memcpy on overlapping regions is undefined behaviour.
    memmove(m_input_predict_buf, m_input_predict_buf + in_len,
            sizeof(float) * (m_input_predict_buf_frame - in_len));
    // Append the new samples at the tail.
    memcpy(m_input_predict_buf + (m_input_predict_buf_frame - in_len), in_buf,
           sizeof(float) * in_len);

    struct timeval start;
    struct timeval end;

    // Extract the f0 feature sequence.
    gettimeofday(&start, NULL);
    m_f0_up_key = m_f0_new_up_key;
    get_pyin_f0();
    gettimeofday(&end, NULL);
    LOGE("CRvcLiteOnline", "get pyin sp = %f ms\n", rvc_lite_elapsed_ms(start, end));

    // Run hubert.
    gettimeofday(&start, NULL);
    m_hubert_inst->process(m_input_predict_buf, m_hubert_ret);
    gettimeofday(&end, NULL);
    LOGE("CRvcLiteOnline", "m_hubert_inst sp = %f ms\n", rvc_lite_elapsed_ms(start, end));

    // Assemble the synth input: hubert features followed by coarse f0 and f0.
    const int f0_frames = static_cast<int>(m_f0_data.size());
    for (int i = 0; i < gs_synth_input_frame; i++) {
        for (int j = 0; j < gs_hubert_dim; j++) {
            m_synth_input[0][i][j] = m_hubert_ret[0][i][j];
        }
        // The tracker may deliver fewer frames than the synth model expects. Missing
        // frames use f0 = 0, for which get_f0_post() yields a coarse value of 1,
        // instead of indexing past the end of the vectors.
        const bool has_f0 = i < f0_frames;
        m_synth_input[0][i][256] = has_f0 ? m_f0_coarse_data[i] : 1.f;
        m_synth_input[0][i][257] = has_f0 ? m_f0_data[i] : 0.f;
    }

    // Synthesise.
    gettimeofday(&start, NULL);
    m_synthesizer_inst->process(m_synth_input, m_synth_out);
    gettimeofday(&end, NULL);
    LOGE("CRvcLiteOnline", "m_synthesizer_inst sp = %f ms\n", rvc_lite_elapsed_ms(start, end));

    // Shift the output cache and append the fresh result (overlapping move again).
    memmove(m_output_cache_buf, m_output_cache_buf + gs_synth_output_frame,
            sizeof(float) * (m_output_cache_buf_frame - gs_synth_output_frame));
    memcpy(m_output_cache_buf + (m_output_cache_buf_frame - gs_synth_output_frame),
           m_synth_out[0][0].data(), sizeof(float) * gs_synth_output_frame);

    // Return the out_len samples that precede the crossfade tail.
    int start_pos = m_output_cache_buf_frame - m_crossfade_frame - out_len;
    memcpy(out_buf, m_output_cache_buf + start_pos, sizeof(float) * out_len);

    // Crossfade the head with the tail kept from the previous call; never write
    // beyond the out_len samples the caller provided.
    const int fade_len = m_crossfade_frame < out_len ? m_crossfade_frame : out_len;
    for (int i = 0; i < fade_len; i++) {
        float rate = float(i * 1.f / m_crossfade_frame);
        out_buf[i] = rate * out_buf[i] + m_crossfade_buf[i] * (1 - rate);
    }
    // Keep the new tail for the next call.
    memcpy(m_crossfade_buf, m_output_cache_buf + (m_output_cache_buf_frame - m_crossfade_frame),
           sizeof(float) * m_crossfade_frame);
    return 0;
}
// Latency in milliseconds: the crossfade time plus a fixed 30 ms.
// Per the original comment, hubert should in theory return 208 frames but actually
// returns 205, which accounts for the extra 30 ms.
int CRvcLiteOnline::get_latency_ms() {
    const int hubert_shortfall_ms = 30;
    return static_cast<int>(gs_crossfade_time * 1000 + hubert_shortfall_ms);
}
/******************************* internal classes **************************************/
// Creates a resampler without a backend. The channel counts start at 0 so that
// resample() called before init() reads defined values (it then converts nothing).
CResample::CResample()
{
    m_resample_inst = nullptr;
    m_in_channel = 0;
    m_out_channel = 0;
}
// Nothing to release explicitly; the members clean up after themselves.
CResample::~CResample() = default;
// Configures the conversion. When only the channel count differs, no backend is
// created and resample() converts channels itself; in every other case a
// CFfmpegResampler is created and its init() status is returned.
int CResample::init(int in_samplerate, int out_samplerate, int in_channel, int out_channel)
{
    m_in_channel = in_channel;
    m_out_channel = out_channel;

    const bool channel_only = (in_samplerate == out_samplerate) && (in_channel != out_channel);
    if (channel_only) {
        m_resample_inst = nullptr;
        return ERR_RVC_LITE_SUCCESS;
    }

    m_resample_inst = std::make_shared<CFfmpegResampler>();
    return m_resample_inst->init(in_samplerate, out_samplerate, in_channel, out_channel);
}
// Output count for `num` inputs: asks the backend when there is one, otherwise
// the count is unchanged.
int CResample::get_out_samples(int num)
{
    return m_resample_inst ? m_resample_inst->get_out_samples(num) : num;
}
// Resets the backend, if any; the channel-only path keeps no state.
void CResample::reset()
{
    if (!m_resample_inst)
    {
        return;
    }
    m_resample_inst->reset();
}
// Latency reported by the backend, or 0 when there is no backend.
int CResample::get_latency()
{
    return m_resample_inst ? m_resample_inst->get_latency() : 0;
}
// Converts in_num frames from in_buf into out_buf.
//
// @param in_buf  interleaved input
// @param in_num  number of input frames (samples per channel), as passed by the callers
// @param out_buf output buffer
// @param out_num in: capacity of out_buf in frames; out: frames produced by the
//                channel-only conversion (the backend handles it on its own path)
// @return backend status, ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT when out_buf is too
//         small, ERR_RVC_LITE_SUCCESS otherwise
int CResample::resample(float *in_buf, int in_num, float *out_buf, int &out_num) {
    if (m_resample_inst) {
        return m_resample_inst->resample(in_buf, in_num, out_buf, out_num);
    }
    if (m_in_channel == 2 && m_out_channel == 1) {
        if (out_num < in_num) {
            return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
        }
        // stereo2mono expects the buffer length in floats, i.e. two per frame.
        // Passing the frame count would down-mix only the first half of the input.
        stereo2mono(in_buf, in_num * 2, out_buf);
        out_num = in_num;
        return ERR_RVC_LITE_SUCCESS;
    }
    if (m_in_channel == 1 && m_out_channel == 2) {
        if (out_num < in_num) {
            return ERR_RVC_LITE_RT_RESAMPLE_OUTBUF_SHORT;
        }
        // For mono input the frame count equals the float count.
        mono2stereo(in_buf, in_num, out_buf);
        out_num = in_num;
        return ERR_RVC_LITE_SUCCESS;
    }
    // Any other channel combination is left untouched.
    return ERR_RVC_LITE_SUCCESS;
}
/******************************* public classes ***************************************/
/******************************* internal functions ***************************************/
// Default state: not initialised, worker stopped, 44100 Hz mono, no synth model.
void CRvcLiteOnlineRealTime::init_variable() {
    // Lifecycle flags.
    m_init = false;
    m_rvc_stop = true;

    // Audio format.
    m_sample_rate = 44100;
    m_channel = 1;

    // Synth-model selection.
    m_new_synth_path = "";
    m_synth_path = "";
    m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
}
/******************************* public functions ***************************************/
// Starts in the default, uninitialised state; call init() before use.
CRvcLiteOnlineRealTime::CRvcLiteOnlineRealTime() {
    this->init_variable();
}
// Stops the worker (through uninit) if the instance was initialised.
CRvcLiteOnlineRealTime::~CRvcLiteOnlineRealTime() {
    this->uninit();
}
int CRvcLiteOnlineRealTime::init(const char *hubert_model_path, int sample_rate, int channel) {
if (m_init) {
return ERR_RVC_LITE_RT_REINIT;
}
if (sample_rate < 16000) {
return ERR_RVC_LITE_RT_INPUT_SAMPLE_ERR;
}
init_variable();
m_sample_rate = sample_rate;
m_channel = channel;
m_synth_path = "";
m_new_synth_path = "";
m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
int output_one_sec_number = m_sample_rate * m_channel; // 临时使用的数据
int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
CThreadPool::Task task = std::bind(&CRvcLiteOnlineRealTime::rvc_process, this);
m_rvc_inst = std::make_shared<CRvcLiteOnline>();
int err = m_rvc_inst->init(hubert_model_path);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
// 重采样部分
m_resample_queue = std::make_shared<CRvcCircleBuffer>(sample_rate * 3 * m_channel);
m_resample16 = std::make_shared<CResample>();
err = m_resample16->init(m_sample_rate, gs_src_samplerate, m_channel, 1);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
m_resample2src = std::make_shared<CResample>();
err = m_resample2src->init(gs_dst_samplerate, m_sample_rate, 1, m_channel);
if (ERR_RVC_LITE_SUCCESS != err) {
goto exit;
}
m_resample_buf_max_len = 2048; // 此时空间最大是2048,保证不超即可
m_resample_in_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
m_resample_out_buf = std::shared_ptr<float>(new float[m_resample_buf_max_len], std::default_delete<float[]>());
// 核心处理部分
m_input_tmp_buf_len = gs_src_samplerate;
m_output_tmp_buf_len = gs_dst_samplerate;
m_input_tmp_buf = std::shared_ptr<float>(new float[m_input_tmp_buf_len], std::default_delete<float[]>());
m_output_tmp_buf = std::shared_ptr<float>(new float[m_output_tmp_buf_len], std::default_delete<float[]>());
memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
// 循环buffer
m_input_queue = std::make_shared<CRvcCircleBuffer>(m_input_tmp_buf_len * 3);
// 对外的是目标的采样率和通道数的数据
m_out_queue = std::make_shared<CRvcCircleBuffer>(output_one_sec_number * 3);
m_latency_queue = std::make_shared<CRvcCircleBuffer>(latency_len);
// 提前塞入两组,保证延迟稳定在2s
for (int i = 0; i < 2; i++) {
// 塞入1s数据
for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
}
// 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟
for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
// 开始处理线程
m_thread_pool = std::make_shared<CThreadPool>();
m_thread_pool->start(1);
m_rvc_stop = false;
m_thread_pool->run(task);
m_init = true;
exit:
if (ERR_RVC_LITE_SUCCESS != err) {
m_init = true;
uninit();
}
return err;
}
// Requests a synth-model change; the worker thread picks the new path up later.
// @param synth_model_path path of the model; null is treated like an empty path
// @return ERR_RVC_LITE_RT_NOT_INIT before init(), ERR_RVC_LITE_SUCCESS otherwise
int CRvcLiteOnlineRealTime::switch_synth(const char *synth_model_path) {
    if (!m_init) {
        return ERR_RVC_LITE_RT_NOT_INIT;
    }
    {
        std::unique_lock<std::mutex> lock(m_rvc_mutex);
        // Assigning a null pointer to std::string is undefined behaviour.
        m_new_synth_path = (synth_model_path != nullptr) ? synth_model_path : "";
    }
    return ERR_RVC_LITE_SUCCESS;
}
// Hands in_len samples to the worker and fetches up to out_len processed samples.
// out_buf is zeroed first, so missing samples come out as silence.
// @return ERR_RVC_LITE_RT_NOT_INIT before init(), ERR_RVC_LITE_RT_NOT_ENOUGH_DATA when
//         fewer than out_len samples were available, ERR_RVC_LITE_SUCCESS otherwise
int CRvcLiteOnlineRealTime::process(float *in_buf, int in_len, float *out_buf, int out_len) {
    if (!m_init) {
        return ERR_RVC_LITE_RT_NOT_INIT;
    }

    // Queue the input and wake the worker.
    {
        std::lock_guard<std::mutex> guard(m_rvc_mutex);
        m_resample_queue->push(in_buf, in_len);
        m_rvc_cond.notify_all();
    }

    memset(out_buf, 0, sizeof(float) * out_len);

    // Fetch whatever is ready; pop() reports the delivered count through `fetched`.
    int fetched = out_len;
    {
        std::lock_guard<std::mutex> guard(m_rvc_mutex);
        m_out_queue->pop(out_buf, fetched);
    }

    return (fetched == out_len) ? ERR_RVC_LITE_SUCCESS : ERR_RVC_LITE_RT_NOT_ENOUGH_DATA;
}
void CRvcLiteOnlineRealTime::reset() {
if (!m_init) {
return;
}
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_resample_queue->reset();
m_resample16->reset();
m_resample2src->reset();
m_input_queue->reset();
m_out_queue->reset();
m_rvc_inst->reset();
m_latency_queue->reset();
// 提前塞入两组,保证延迟稳定在2s
int output_one_sec_number = m_sample_rate * m_channel; // 临时使用的数据
memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
for (int i = 0; i < 2; i++) {
for (int j = 0; j < output_one_sec_number / m_output_tmp_buf_len; j++) {
m_out_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_out_queue->push(m_output_tmp_buf.get(), output_one_sec_number % m_output_tmp_buf_len);
}
// 算法本身有延迟,所有为了保证延迟一致,在无效果的时候需要添加该延迟
int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
}
}
void CRvcLiteOnlineRealTime::flush(float *&out_buf, int &len) {
// 将内部的所有的数据吐出来
/**
* 先停止
*/
stop();
// 无音色转换的情况
int resample_in_len = 0;
int resample_out_len = 0;
if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
{
while (m_resample_queue->size() > 0) {
resample_in_len = m_resample_buf_max_len;
m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
m_latency_queue->push(m_resample_in_buf.get(), resample_in_len);
m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
}
while(m_latency_queue->size() > 0)
{
resample_in_len = m_resample_buf_max_len;
m_latency_queue->pop(m_resample_in_buf.get(), resample_in_len);
m_out_queue->push(m_resample_in_buf.get(), resample_in_len);
}
len = m_out_queue->size();
out_buf = new float[len];
m_out_queue->pop(out_buf, len);
return;
}
// 有音色转换的情况
while (m_resample_queue->size() > 0) {
resample_in_len = m_resample_buf_max_len;
m_resample_queue->pop(m_resample_in_buf.get(), resample_in_len);
// 输入的数据需要考虑channel
resample_out_len = m_resample16->get_out_samples(resample_in_len / m_channel);
m_resample16->resample(m_resample_in_buf.get(), resample_in_len / m_channel, m_resample_out_buf.get(),
resample_out_len);
// 输出是16k单声道,不需要考虑
m_input_queue->push(m_resample_out_buf.get(), resample_out_len);
}
memset(m_input_tmp_buf.get(), 0, sizeof(float) * m_input_tmp_buf_len);
int add_size = m_input_tmp_buf_len - m_input_queue->size() % m_input_tmp_buf_len;
if (add_size != 0 && add_size < m_input_tmp_buf_len) {
m_input_queue->push(m_input_tmp_buf.get(), add_size);
}
int num = m_input_queue->size() / m_input_tmp_buf_len;
for (int i = 0; i < num; i++) {
rvc_process_step();
}
// 将所有数据拷贝出来
len = m_out_queue->size();
out_buf = new float[len];
m_out_queue->pop(out_buf, len);
}
// Total latency in milliseconds: the core latency plus a fixed 2000 ms.
int CRvcLiteOnlineRealTime::get_latency_ms() {
    const int core_latency_ms = m_rvc_inst->get_latency_ms();
    return core_latency_ms + 2000;
}
/******************************* internal functions ***************************************/
// Stops the worker when the instance was initialised; otherwise does nothing.
void CRvcLiteOnlineRealTime::uninit() {
    if (m_init) {
        stop();
    }
}
// Asks the worker to finish and stops the thread pool.
void CRvcLiteOnlineRealTime::stop() {
    m_rvc_stop = true;
    if (!m_thread_pool) {
        return;
    }
    // Wake rvc_process first in case it is waiting on the condition variable.
    m_rvc_cond.notify_all();
    m_thread_pool->stop();
}
// Converts one model block, if one is available: pops m_input_tmp_buf_len samples
// from the input queue, runs the voice conversion, resamples the result to the
// caller's format in small chunks and pushes it to the output queue.
// While switching from "effect" to "no effect" the last chunk is faded out.
void CRvcLiteOnlineRealTime::rvc_process_step() {
    struct timeval start;
    struct timeval end;
    int sample_out_len = 0;
    // Not enough input for a whole block yet.
    if (m_input_queue->size() < m_input_tmp_buf_len) {
        return;
    }
    gettimeofday(&start, NULL);
    m_input_queue->pop(m_input_tmp_buf.get(), m_input_tmp_buf_len);
    m_rvc_inst->process_block(m_input_tmp_buf.get(), m_input_tmp_buf_len,
                              m_output_tmp_buf.get(), m_output_tmp_buf_len);
    gettimeofday(&end, NULL);
    LOGD("RvcLite", "rvc_process process sp %f ms",
         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);

    // Resample back to the caller's format. Per the original comment the sample rate
    // grows by less than 2x and the channel count by at most 2x, so a quarter of the
    // scratch buffer is used per chunk.
    gettimeofday(&start, NULL);
    bool last = false;
    int step = m_resample_buf_max_len / 4;
    for (int i = 0; i < m_output_tmp_buf_len; i += step) {
        if (i + step >= m_output_tmp_buf_len) {
            step = m_output_tmp_buf_len - i;
            last = true;
        }
        // The input is mono, so its frame count equals its length.
        sample_out_len = m_resample2src->get_out_samples(step);
        m_resample2src->resample(m_output_tmp_buf.get() + i, step, m_resample_out_buf.get(), sample_out_len);

        // Switching from effect to no effect: the pass-through path is delay-aligned
        // anyway, so a fade-out of the last chunk is all that is needed.
        if (last && m_syn_state == RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT) {
            // The ramp is normalised by the number of output samples actually faded.
            // Dividing by the input chunk length let the rate exceed 1 (more output
            // samples than input samples), which produced a negative gain.
            const int faded_samples = sample_out_len * m_channel;
            for (int ii = 0; ii < faded_samples; ii += m_channel) {
                float rate = ii * 1.0 / faded_samples;
                for (int jj = 0; jj < m_channel; jj++) {
                    m_resample_out_buf.get()[ii + jj] = m_resample_out_buf.get()[ii + jj] * (1 - rate);
                }
            }
            m_syn_state = RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT;
        }
        {
            std::unique_lock<std::mutex> lock(m_rvc_mutex);
            m_out_queue->push(m_resample_out_buf.get(), sample_out_len * m_channel);
        }
    }
    gettimeofday(&end, NULL);
    LOGD("RvcLite", "rvc_process re_resample sp %f ms",
         (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0);
    printf("finish ...\n");
}
// Worker loop, bound as the thread-pool task in init(). Each iteration pops
// m_resample_buf_max_len samples of caller-format audio, applies a pending synth-model
// switch, and then either passes the audio through the latency queue (no effect) or
// resamples it and runs rvc_process_step() (effect). Runs until m_rvc_stop is set.
// NOTE(review): m_new_synth_path, m_syn_state and m_latency_queue are accessed here
// without holding m_rvc_mutex, while switch_synth()/reset() touch them under the lock -
// confirm whether that is intended.
void CRvcLiteOnlineRealTime::rvc_process() {
int sample_in_len;
int sample_out_len = 0;
while (!m_rvc_stop) {
{
// Fetch one chunk for resampling.
std::unique_lock<std::mutex> lock(m_rvc_mutex);
if (m_resample_queue->size() < m_resample_buf_max_len) {
// Check the stop flag before going to sleep.
if (m_rvc_stop) {
return;
}
m_rvc_cond.wait(lock);
continue;
}
sample_in_len = m_resample_buf_max_len;
m_resample_queue->pop(m_resample_in_buf.get(), sample_in_len);
}
/**
* There are three cases here.
* Whichever transition happens, the latency means zeros have to be fed in,
* so fading out the current data is sufficient.
* 1. none -> effect: fade out the current part, fade in the next frame fed to the effect
* 2. effect -> none: fade out the current part, fade in the next frame
* 3. effect -> effect [needs no handling here, it is dealt with internally]
*/
if (m_synth_path != m_new_synth_path) {
// none -> effect: fade out this frame, fade in the next input frame.
if(m_synth_path.empty() && !m_new_synth_path.empty())
{
m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT;
}
// effect -> none
if (!m_synth_path.empty() && m_new_synth_path.empty())
{
m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT2DEFAULT;
}
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_synth_path = m_new_synth_path;
}
m_rvc_inst->switch_synth_model(m_new_synth_path.c_str());
}
// First chunk right after switching back to "no effect".
if(m_syn_state == RVC_LITE_RT_SYN_STATE_BEFORE_DEFAULT)
{
// Just switched from effect to none: clear the latency queue and fade the input in.
m_latency_queue->reset();
// The algorithm itself has latency; to keep the latency identical the same delay
// is inserted when no effect is applied.
memset(m_output_tmp_buf.get(), 0, sizeof(float) * m_output_tmp_buf_len);
int latency_len = gs_crossfade_time * m_sample_rate * m_channel;
for (int j = 0; j < latency_len / m_output_tmp_buf_len; j++) {
m_latency_queue->push(m_output_tmp_buf.get(), m_output_tmp_buf_len);
}
m_latency_queue->push(m_output_tmp_buf.get(), latency_len % m_output_tmp_buf_len);
// Fade the input in.
for(int i = 0; i < sample_in_len; i+=m_channel)
{
float rate = i * 1.0 / sample_in_len;
for(int j = 0; j < m_channel; j++)
{
m_resample_in_buf.get()[i+j] *= rate;
}
}
m_syn_state = RVC_LITE_RT_SYN_STATE_DEFAULT;
}
// No effect: delay the chunk through the latency queue and output it.
if(m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT)
{
m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
}
continue;
}
// Transition none -> effect.
if (m_syn_state == RVC_LITE_RT_SYN_STATE_DEFAULT2EFFECT)
{
// Fade the chunk out.
for(int i = 0; i < sample_in_len; i+=m_channel)
{
float rate = i * 1.0 / sample_in_len;
for(int j = 0; j < m_channel; j++)
{
m_resample_in_buf.get()[i+j] *= 1 - rate;
}
}
m_latency_queue->push(m_resample_in_buf.get(), sample_in_len);
m_latency_queue->pop(m_resample_in_buf.get(), sample_in_len);
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_out_queue->push(m_resample_in_buf.get(), sample_in_len);
}
// From the converter's point of view the input is no longer continuous, so its
// internal data is cleared and it starts over.
m_syn_state = RVC_LITE_RT_SYN_STATE_EFFECT;
m_rvc_inst->reset();
continue;
}
// Resample to 16k. The sample rate goes down here, so sample_out_len > sample_in_len
// cannot happen (per the original comment).
sample_out_len = m_resample16->get_out_samples(sample_in_len / m_channel);
m_resample16->resample(m_resample_in_buf.get(), sample_in_len / m_channel, m_resample_out_buf.get(),
sample_out_len);
m_input_queue->push(m_resample_out_buf.get(), sample_out_len);
rvc_process_step();
}
}
\ No newline at end of file
diff --git a/mnn_demo/src/CRvcLiteOnlineV2.cpp b/mnn_demo/src/CRvcLiteOnlineV2.cpp
index 0269a7d..97a67fb 100644
--- a/mnn_demo/src/CRvcLiteOnlineV2.cpp
+++ b/mnn_demo/src/CRvcLiteOnlineV2.cpp
@@ -1,215 +1,215 @@
//
// Created by Administrator on 2024/1/22.
//
#include "CRvcLiteOnlineV2.h"
#include "CRvcCircleBuffer.h"
#include <unistd.h>
// Reports whether `name` refers to an existing filesystem entry.
// Uses access(2) with F_OK, so only existence is probed, not permissions.
inline bool file_exists1(const std::string& name) {
    const int probe = access(name.c_str(), F_OK);
    return probe != -1;
}
// Construction does no work; all state is set up by init().
CRvcLiteOnlineV2::CRvcLiteOnlineV2() = default;
// Nothing to release explicitly; members clean up after themselves.
CRvcLiteOnlineV2::~CRvcLiteOnlineV2() = default;
/*****************************************对内函数***************************************************************/
/**
 * Reconciles the active synth model (m_syn_model) with the requested one
 * (m_new_syn_model) and updates m_sync_state to describe the transition.
 *
 * Transitions:
 *   - none   -> effect : pass-through fades out, effect fades in (DEFAULT2EFFECT)
 *   - effect -> none   : effect fades out, pass-through fades in (EFFECT2DEFAULT)
 *   - effect -> effect : no state change here; the synthesizer is expected to
 *                        cross-fade internally, but it still has to be told about
 *                        the new model
 *
 * @param reset true when the pipeline was just reset; there is then no previous
 *              audio to cross-fade with, so the target state is entered directly.
 */
void CRvcLiteOnlineV2::set_cur_state(bool reset)
{
    if (m_syn_model == m_new_syn_model)
    {
        return;
    }

    const bool had_model = !m_syn_model.empty();
    const bool want_model = !m_new_syn_model.empty();

    if (!had_model && want_model)
    {
        // none -> effect; after a reset skip the cross-fade and go straight to EFFECT
        m_sync_state = reset ? CRVC_V2_STATE_EFFECT : CRVC_V2_STATE_DEFAULT2EFFECT;
        m_syn_model = m_new_syn_model;
        m_rvc_inst->switch_model(m_syn_model.c_str());
    }
    else if (had_model && !want_model)
    {
        // effect -> none; after a reset skip the cross-fade and go straight to DEFAULT
        m_sync_state = reset ? CRVC_V2_STATE_DEFAULT : CRVC_V2_STATE_EFFECT2DEFAULT;
        m_syn_model = m_new_syn_model;
    }
    else
    {
        // effect -> effect: the names differ and both are non-empty. Previously this
        // case fell through both branches, so the new model was never loaded and the
        // mismatch was re-detected on every call. The state itself stays unchanged.
        m_syn_model = m_new_syn_model;
        m_rvc_inst->switch_model(m_syn_model.c_str());
    }
}
/*****************************************对外函数***************************************************************/
/**
 * Creates the synthesizer and sizes all working buffers.
 *
 * Lengths are in interleaved samples (frames * channel). The scratch buffers hold
 * m_tmp_buf_len = 2 * m_block_len samples and both circular queues are created
 * with twice that capacity. The instance starts in CRVC_V2_STATE_DEFAULT with no
 * model selected and with a pending reset (m_reset = true).
 *
 * @param hubert_model forwarded to CRvcLiteSynthesizer::init
 * @param sample_rate  sample rate of the audio passed to push()
 * @param channel      channel count of the audio passed to push()
 * @return the result of CRvcLiteSynthesizer::init
 */
int CRvcLiteOnlineV2::init(const char *hubert_model, int sample_rate, int channel)
{
m_rvc_inst = std::make_shared<CRvcLiteSynthesizer>();
- m_block_len = sample_rate * channel - 100 * channel;
+ m_block_len = int(sample_rate * 0.9) * channel; // process one block every 900 ms
m_tmp_buf_len = m_block_len * 2;
m_reset = true;
m_syn_model = "";
m_new_syn_model = "";
m_sync_state = CRVC_V2_STATE_DEFAULT;
m_fade_len = int(sample_rate * 0.05) * channel; // 50 ms worth of samples is used for the cross-fade
m_channel = channel;
m_tmp_in_buf = std::shared_ptr<float>(new float[m_tmp_buf_len], std::default_delete<float[]>());
m_tmp_out_buf = std::shared_ptr<float>(new float[m_tmp_buf_len], std::default_delete<float[]>());
m_in_queue = std::make_shared<CRvcCircleBuffer>(m_tmp_buf_len * 2);
m_out_queue = std::make_shared<CRvcCircleBuffer>(m_tmp_buf_len * 2);
m_input_latency_output_frame = 0;
return m_rvc_inst->init(hubert_model, sample_rate, channel);
}
/**
 * Requests a synth model change; the change is applied on the next push().
 *
 * @param synth_model path of the model file, or "" (or nullptr) to disable the effect
 * @return ERR_RVC_LITE_MODEL_NOT_EXISTS when a non-empty path does not exist,
 *         ERR_RVC_LITE_SUCCESS otherwise
 */
int CRvcLiteOnlineV2::switch_model(const char *synth_model)
{
    // A null pointer is treated like the empty string: both mean "no model".
    const char *requested = (synth_model != nullptr) ? synth_model : "";

    // Test the string contents. The previous `synth_model != ""` compared pointer
    // values, so an empty path was still checked on disk and rejected, which made
    // it impossible to switch the effect off.
    const bool has_path = requested[0] != '\0';
    if (has_path && !file_exists1(requested))
    {
        return ERR_RVC_LITE_MODEL_NOT_EXISTS;
    }
    m_new_syn_model = requested;
    return ERR_RVC_LITE_SUCCESS;
}
void CRvcLiteOnlineV2::set_up_key(int key)
{
// 内部是线程安全的,所以直接设置即可
m_rvc_inst->set_up_key(key);
}
void CRvcLiteOnlineV2::reset()
{
m_reset = true;
}
/**
 * Feeds interleaved input samples and appends the produced audio to m_out_queue.
 *
 * Flow:
 *   1. Apply a pending reset (clears both queues and the synthesizer).
 *   2. Update the effect state via set_cur_state().
 *   3. In CRVC_V2_STATE_DEFAULT the input is copied straight to the output queue.
 *   4. Otherwise the input is queued and consumed in blocks of m_block_len through
 *      m_rvc_inst->process(); on a state transition the first m_fade_len samples
 *      of the block are cross-faded between the dry input and the effect output.
 *
 * @param buf  input samples
 * @param len  number of samples in buf (frames * channel)
 * @param last when true, the remaining queued input is processed even if it is
 *             shorter than m_block_len
 * @return ERR_RVC_LITE_SUCCESS, or the error returned by process()
 */
int CRvcLiteOnlineV2::push(float *buf, int len, bool last)
{
bool reset = m_reset;
if (m_reset)
{
m_reset = false;
m_input_latency_output_frame = 0;
m_in_queue->reset();
m_out_queue->reset();
m_rvc_inst->reset();
}
set_cur_state(reset);
if (CRVC_V2_STATE_DEFAULT == m_sync_state)
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_out_queue->push(buf, len);
return ERR_RVC_LITE_SUCCESS;
}
// In every other state the model has to run first; decide what to emit afterwards
m_in_queue->push(buf, len);
while(m_in_queue->size() >= m_block_len || last) {
if (m_in_queue->size() <= 0)
{
return ERR_RVC_LITE_SUCCESS;
}
int cur_in_len = m_block_len;
int cur_out_len = m_block_len;
// presumably pop() updates cur_in_len to the number of samples actually read — TODO confirm
m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len);
int err = m_rvc_inst->process(m_tmp_in_buf.get(), cur_in_len, m_tmp_out_buf.get(), cur_out_len);
if (err != ERR_RVC_LITE_SUCCESS) {
return err;
}
// effect -> none: fade the effect out and the dry signal in
if (m_sync_state == CRVC_V2_STATE_EFFECT2DEFAULT)
{
// m_rvc_inst delays its output (even though the leading silent frames have
// already been trimmed), so its input and output samples are not aligned:
// there is a latency offset between the head of the input and the head of
// the output, while the dry path has no such offset.
// The head of the input therefore has to be matched with the output sample
// it really corresponds to.
// Example: input 1,2,3,4,5 -> output l1,l2,1,2,3 where l1,l2 are the latency
// samples, i.e. input 1,2 lines up with the output shifted by the latency.
// NOTE(review): the loop always covers m_fade_len samples, even when cur_in_len
// or cur_out_len is shorter (e.g. a short final block) — confirm the buffers
// hold valid data in that case
for(int i = 0; i < m_fade_len; i+=m_channel)
{
float rate = i * 1.0 / m_fade_len;
for(int j = 0; j < m_channel; j+=1)
{
m_tmp_in_buf.get()[i+j] = m_tmp_in_buf.get()[i+j] * rate + m_tmp_out_buf.get()[i+j+m_input_latency_output_frame] * (1 - rate);
}
}
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
// First emit the effect output that was still owed because of the latency
m_out_queue->push(m_tmp_out_buf.get(), m_input_latency_output_frame);
m_out_queue->push(m_tmp_in_buf.get(), cur_in_len);
}
m_sync_state = CRVC_V2_STATE_DEFAULT;
m_input_latency_output_frame = 0;
// Flush whatever is still queued as dry signal
while(m_in_queue->size() > 0)
{
cur_in_len = m_block_len;
m_in_queue->pop(m_tmp_in_buf.get(), cur_in_len);
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_out_queue->push(m_tmp_in_buf.get(), cur_in_len);
}
}
return ERR_RVC_LITE_SUCCESS;
}
// none -> effect: fade the effect in and the dry signal out
if (m_sync_state == CRVC_V2_STATE_DEFAULT2EFFECT)
{
for(int i = 0; i < m_fade_len; i+=m_channel)
{
float rate = i * 1.0 / m_fade_len;
for(int j = 0; j < m_channel; j+=1)
{
m_tmp_out_buf.get()[i+j] = m_tmp_out_buf.get()[i+j] * rate + m_tmp_in_buf.get()[i+j] * (1 - rate);
}
}
// Transition finished
m_sync_state = CRVC_V2_STATE_EFFECT;
}
// The effect may return fewer samples than it consumed; track the accumulated gap
m_input_latency_output_frame += cur_in_len - cur_out_len;
// Push the result under the lock
{
std::unique_lock<std::mutex> lock(m_rvc_mutex);
m_out_queue->push(m_tmp_out_buf.get(), cur_out_len);
}
}
return ERR_RVC_LITE_SUCCESS;
}
int CRvcLiteOnlineV2::size()
{
return m_out_queue->size();
}
/**
 * Reads processed samples from the output queue while holding m_rvc_mutex.
 *
 * @param buf destination buffer
 * @param len requested sample count, passed by reference to the queue's pop()
 *            (presumably updated to the number actually read — TODO confirm)
 */
void CRvcLiteOnlineV2::pop(float *buf, int &len)
{
    std::lock_guard<std::mutex> guard(m_rvc_mutex);
    m_out_queue->pop(buf, len);
}
diff --git a/mnn_demo/src/CRvcLiteSynthesizer.cpp b/mnn_demo/src/CRvcLiteSynthesizer.cpp
index 6ff952b..711a6ed 100644
--- a/mnn_demo/src/CRvcLiteSynthesizer.cpp
+++ b/mnn_demo/src/CRvcLiteSynthesizer.cpp
@@ -1,128 +1,127 @@
//
// Created by Administrator on 2024/1/21.
//
#include "CRvcLiteSynthesizer.h"
#include <cstring>
#include <sys/time.h>
// Construction does no work; init() sets up every member.
CRvcLiteSynthesizer::CRvcLiteSynthesizer() = default;
// Nothing to release explicitly; the shared_ptr members clean up after themselves.
CRvcLiteSynthesizer::~CRvcLiteSynthesizer() = default;
/**
 * Creates the inference engine and the two resamplers, and clears the scratch
 * buffer bookkeeping.
 *
 * @param hubert_model forwarded to CRvcLiteOnline::init
 * @param sample_rate  sample rate of the caller's audio
 * @param channel      channel count of the caller's audio
 * @return the CRvcLiteOnline::init error, or ERR_RVC_LITE_SUCCESS
 */
int CRvcLiteSynthesizer::init(const char *hubert_model, int sample_rate, int channel)
{
    m_rvc_inst = std::make_shared<CRvcLiteOnline>();
    const int rc = m_rvc_inst->init(hubert_model);
    if (ERR_RVC_LITE_SUCCESS != rc)
    {
        return rc;
    }

    // Caller format -> single channel at gs_src_samplerate (model input).
    m_resample2_16 = std::make_shared<CResample>();
    m_resample2_16->init(sample_rate, gs_src_samplerate, channel, 1);

    // Single channel at gs_dst_samplerate (model output) -> caller format.
    m_resample2src = std::make_shared<CResample>();
    m_resample2src->init(gs_dst_samplerate, sample_rate, 1, channel);

    m_sample_rate = sample_rate;
    m_channel = channel;

    // Scratch buffers are allocated lazily in process(); start with nothing.
    m_buf_tmp_16k_cap = 0;
    m_buf_tmp_16k_len = 0;
    m_buf_tmp_32k_cap = 0;
    m_buf_tmp_32k_len = 0;
    m_buf_tmp_src_cap = 0;
    m_buf_tmp_src_len = 0;

    // The first processed block still has its leading latency trimmed.
    m_first = true;
    return ERR_RVC_LITE_SUCCESS;
}
/**
 * Selects the voice model by delegating to CRvcLiteOnline::switch_synth_model.
 * @return the delegate's result code
 */
int CRvcLiteSynthesizer::switch_model(const char *synth_model)
{
    const int rc = m_rvc_inst->switch_synth_model(synth_model);
    return rc;
}
void CRvcLiteSynthesizer::set_up_key(int key)
{
m_rvc_inst->set_up_key(key);
}
/**
 * Clears the engine's internal data and re-arms the leading-latency trimming
 * that process() performs on the first block.
 */
void CRvcLiteSynthesizer::reset()
{
    m_first = true;
    m_rvc_inst->reset();
}
int CRvcLiteSynthesizer::process(float *in_buf, int in_len, float *out_buf, int &out_len) {
// 1 重采样 2 推理 3 再次重采样
int resample_out_len = m_resample2_16->get_out_samples(in_len / m_channel);
// 控制逻辑,不能超过该长度
if (resample_out_len > gs_src_samplerate) {
return ERR_RVC_LITE_BLOCK_TOO_LONG;
}
-
if (m_buf_tmp_16k_cap < resample_out_len) {
m_buf_tmp_16k_cap = resample_out_len;
m_buf_tmp_16k = std::shared_ptr<float>(new float[m_buf_tmp_16k_cap], std::default_delete<float[]>());
}
m_buf_tmp_16k_len = resample_out_len;
int err = m_resample2_16->resample(in_buf, in_len / m_channel, m_buf_tmp_16k.get(), m_buf_tmp_16k_len);
if (err != ERR_RVC_LITE_SUCCESS) {
return err;
}
if (m_buf_tmp_32k_cap < m_buf_tmp_16k_len * 2) {
m_buf_tmp_32k_cap = m_buf_tmp_16k_len * 2;
m_buf_tmp_32k = std::shared_ptr<float>(new float[m_buf_tmp_32k_cap], std::default_delete<float[]>());
}
m_buf_tmp_32k_len = m_buf_tmp_16k_len * 2;
// 推理
err = m_rvc_inst->process_block(m_buf_tmp_16k.get(), m_buf_tmp_16k_len, m_buf_tmp_32k.get(), m_buf_tmp_32k_len);
if (err != ERR_RVC_LITE_SUCCESS) {
return err;
}
// 重采样回来
int out_frame = m_resample2src->get_out_samples(m_buf_tmp_32k_len);
if (m_buf_tmp_src_cap < out_frame * m_channel) {
m_buf_tmp_src_cap = out_frame * m_channel;
m_buf_tmp_src = std::shared_ptr<float>(new float[m_buf_tmp_src_cap], std::default_delete<float[]>());
}
m_buf_tmp_src_len = out_frame;
err = m_resample2src->resample(m_buf_tmp_32k.get(), m_buf_tmp_32k_len, m_buf_tmp_src.get(), m_buf_tmp_src_len);
if (err != ERR_RVC_LITE_SUCCESS) {
return err;
}
// 取较小的值
if (out_len > m_buf_tmp_src_len * m_channel)
{
out_len = m_buf_tmp_src_len * m_channel;
}
// 第一次过来,将头部的延迟块切掉
int latency_frame = 0;
if (m_first)
{
m_first = false;
latency_frame = int(m_rvc_inst->get_latency_ms() * 1.0 / 1000 * m_sample_rate) * m_channel;
out_len -= latency_frame;
}
memcpy(out_buf, m_buf_tmp_src.get()+latency_frame, sizeof(float) * out_len);
return ERR_RVC_LITE_SUCCESS;
}
/**
 * Measures the real-time factor: wall-clock seconds spent processing roughly one
 * second of audio (m_sample_rate - 100 frames).
 *
 * The measurement runs a real process() call on silence, so it advances the
 * synthesizer's state (including the first-block latency trimming); call
 * reset() afterwards if that matters.
 *
 * @return elapsed seconds for the test block
 */
float CRvcLiteSynthesizer::get_rtf()
{
    const int in_len = m_sample_rate * m_channel - 100 * m_channel;
    int out_len = in_len;

    // Zero-initialised input: the previous code fed uninitialised memory to the
    // model. A separate output buffer avoids relying on process() tolerating
    // aliased in/out pointers.
    std::unique_ptr<float[]> in_buf(new float[in_len]());
    std::unique_ptr<float[]> out_buf(new float[in_len]());

    struct timeval start;
    struct timeval end;
    gettimeofday(&start, NULL);
    // out_len (not in_len) receives the produced length.
    process(in_buf.get(), in_len, out_buf.get(), out_len);
    gettimeofday(&end, NULL);

    const double elapsed_ms = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0;
    return elapsed_ms / 1000;
}
\ No newline at end of file
diff --git a/mnn_demo/src/CSynthesizer.cpp b/mnn_demo/src/CSynthesizer.cpp
index 2346fe9..427d6c8 100644
--- a/mnn_demo/src/CSynthesizer.cpp
+++ b/mnn_demo/src/CSynthesizer.cpp
@@ -1,73 +1,86 @@
//
// Created by ZIHAO GUO on 2023/11/16.
//
#include "CSynthesizer.h"
#include <cstring>
#include <sys/time.h>
// Construction does no work; the model is loaded by init().
CSynthesizer::CSynthesizer() {}
// Releases the model and clears the session handles via uninit().
CSynthesizer::~CSynthesizer()
{
    this->uninit();
}
/**
 * Loads the MNN model and creates a CPU session for it.
 *
 * @param model_path path of the .mnn model file
 * @return 0 on success, -1 when the path is null, the model cannot be loaded, or
 *         the session / input tensor cannot be created (previously these cases
 *         dereferenced a null pointer)
 */
int CSynthesizer::init(const char *model_path) {
    if (model_path == nullptr)
    {
        return -1;
    }
    m_config.type = MNN_FORWARD_CPU;
    m_runtime_info = MNN::Interpreter::createRuntime({m_config});
    m_net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path));
    if (m_net == nullptr)
    {
        // The interpreter could not be created from the file (missing or invalid model).
        return -1;
    }
    m_session = m_net->createSession(m_config, m_runtime_info);
    if (m_session == nullptr)
    {
        return -1;
    }
    m_input_tensor = m_net->getSessionInput(m_session, nullptr);
    if (m_input_tensor == nullptr)
    {
        return -1;
    }
    return 0;
}
float CSynthesizer::process(std::vector<std::vector<std::vector<float>>> &contentvec_input, std::vector<std::vector<std::vector<float>>> &ret) {
std::vector<int> input_dims{1, 205, 258};
auto input_tensor = MNN::Tensor::create<float>(input_dims, nullptr, MNN::Tensor::CAFFE);
auto input_data = input_tensor->host<float>();
auto input_size = input_tensor->size();
// ::memcpy(input_data, contentvec_input.data(), input_size);
for (int i = 0; i < 205; i++)
{
std::memcpy(input_data+i*258, contentvec_input[0][i].data(), input_size / 205);
}
m_input_tensor->copyFromHostTensor(input_tensor);
delete input_tensor;
struct timeval start;
struct timeval end;
gettimeofday(&start, NULL);
m_net->runSession(m_session);
gettimeofday(&end, NULL);
float time = (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_usec - start.tv_usec) / 1000.0;
auto output_tensor = m_net->getSessionOutput(m_session, nullptr);
std::vector<int> shape = output_tensor->shape();
auto output = MNN::Tensor::create<float>(shape, nullptr, MNN::Tensor::CAFFE);
auto output_data = output->host<float>();
auto output_size = output->size();
output_tensor->copyToHostTensor(output);
+
for (int i = 0; i < shape[0]; i++)
{
+ if (shape[0] > ret.size())
+ {
+ ret.resize(shape[0]);
+ }
for (int j = 0; j < shape[1]; j++)
{
+ if (shape[1] > ret[j].size())
+ {
+ ret[j].resize(shape[1]);
+ }
for (int k = 0; k < shape[2]; k++)
{
+ if (shape[2] > ret[i][j].size())
+ {
+ ret[i][j].resize(shape[2]);
+ }
ret[i][j][k] = *(output_data + i * 35840 + k);
}
}
}
return time;
}
/**
 * Releases the loaded model (if any) and clears the interpreter, session and
 * input-tensor handles. Safe to call more than once.
 */
void CSynthesizer::uninit() {
    if (m_net)
    {
        m_net->releaseModel();
    }
    m_input_tensor = nullptr;
    m_session = nullptr;
    m_net = nullptr;
}
diff --git a/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp b/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp
index a0c762d..604ab5a 100644
--- a/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp
+++ b/mnn_demo/third_party/espyin-v1.0/ESPYIN.cpp
@@ -1,163 +1,162 @@
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
/*
pYIN - A fundamental frequency estimator for monophonic audio
Centre for Digital Music, Queen Mary, University of London.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version. See the file
COPYING included with this distribution for more information.
*/
#include "ESPYIN.h"
#include "ESMonoPitch.h"
#include <vector>
#include <algorithm>
#include <cstdio>
#include <cmath>
#include <complex>
using std::string;
using std::vector;
// Builds a pYIN pitch tracker.
//   inputSampleRate - forwarded to the underlying ESYin analyser
//   stepSize        - stored in m_stepSize (not used by the code visible here)
//   blockSize       - number of samples consumed by each process() call
//   fmin / fmax     - stored in m_fmin / m_fmax (not used by the code visible here)
// The m_o* members are the fixed output slots (0..4) used as keys of the
// ESFeatureSet returned by process() / getRemainingFeatures().
// m_outputUnvoiced = 2 makes getRemainingFeatures() emit the tracker output as is
// (0 would skip negative frames, 1 would emit absolute values).
// NOTE(review): the third m_yin argument (0.0) looks like a threshold — confirm against ESYin.
ESPYIN::ESPYIN(float inputSampleRate, size_t stepSize, size_t blockSize, size_t fmin, size_t fmax) :
m_stepSize(stepSize),
m_blockSize(blockSize),
m_fmin(fmin),
m_fmax(fmax),
m_yin(blockSize, inputSampleRate, 0.0),
m_oF0Candidates(0),
m_oF0Probs(1),
m_oVoicedProb(2),
m_oCandidateSalience(3),
m_oSmoothedPitchTrack(4),
m_threshDistr(2.0f),
m_outputUnvoiced(2.0f),
m_pitchProb(0)
{
// Push the threshold distribution / frame size into m_yin and start with an empty history
reset();
}
// Nothing to release explicitly; members clean up after themselves.
ESPYIN::~ESPYIN() = default;
// Discards the accumulated per-frame pitch candidates and re-applies the
// threshold distribution and frame size to the underlying YIN analyser.
void ESPYIN::reset()
{
    m_pitchProb.clear();
    m_yin.setThresholdDistr(m_threshDistr);
    m_yin.setFrameSize(m_blockSize);
}
// Trims the candidate history so that only the most recent `reserve_frame_num`
// frames remain. Does nothing when the history is empty or when
// reserve_frame_num is <= 0 or larger than the number of stored frames.
//
// The leading frames are erased in place; the previous implementation deep-copied
// the whole history twice to achieve the same result.
void ESPYIN::updata(int reserve_frame_num)
{
    const int frame_num = int(m_pitchProb.size());
    if (reserve_frame_num <= 0 || reserve_frame_num > frame_num) {
        return;
    }
    const int drop_count = frame_num - reserve_frame_num;
    m_pitchProb.erase(m_pitchProb.begin(), m_pitchProb.begin() + drop_count);
}
-ESFeatureSet
-ESPYIN::process(const float * const inputBuffers)
+ESFeatureSet ESPYIN::process(const float * const inputBuffers)
{
ESFeatureSet fs;
double *dInputBuffers = new double[m_blockSize];
for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[i];
ESYin::YinOutput yo = m_yin.processProbabilisticYin(dInputBuffers);
ESFeature f;
for (size_t i = 0; i < yo.freqProb.size(); ++i)
{
f.values.push_back(yo.freqProb[i].first);
}
fs[m_oF0Candidates].push_back(f);
f.values.clear();
float voicedProb = 0;
for (size_t i = 0; i < yo.freqProb.size(); ++i)
{
f.values.push_back(yo.freqProb[i].second);
voicedProb += yo.freqProb[i].second;
}
fs[m_oF0Probs].push_back(f);
f.values.clear();
f.values.push_back(voicedProb);
fs[m_oVoicedProb].push_back(f);
f.values.clear();
float salienceSum = 0;
for (size_t iBin = 0; iBin < yo.salience.size(); ++iBin)
{
f.values.push_back(yo.salience[iBin]);
salienceSum += yo.salience[iBin];
}
fs[m_oCandidateSalience].push_back(f);
delete [] dInputBuffers;
vector<pair<double, double> > tempPitchProb;
for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate)
{
double tempPitch = 12 * std::log(yo.freqProb[iCandidate].first/440)/std::log(2.) + 69;
tempPitchProb.push_back(pair<double, double>
(tempPitch, yo.freqProb[iCandidate].second));
}
m_pitchProb.push_back(tempPitchProb);
return fs;
}
// Runs the mono-pitch smoother over all frames accumulated by process() and
// returns the smoothed track in slot m_oSmoothedPitchTrack.
//
// m_outputUnvoiced selects how negative tracker values are reported:
//   0 - such frames are skipped
//   1 - the absolute value is emitted
//   otherwise - the value is emitted unchanged
// Returns an empty feature set when no frames have been accumulated.
ESFeatureSet
ESPYIN::getRemainingFeatures(int reso_type)
{
    ESFeatureSet fs;
    ESFeature f;
    // Work on a copy so the smoother cannot modify the stored history.
    vector<vector<pair<double, double> > > temp_pitchProb(m_pitchProb);
    if (temp_pitchProb.empty()) {
        return fs;
    }
    // MONO-PITCH STUFF
    ESMonoPitch mp(reso_type);
    vector<float> mpOut = mp.process(temp_pitchProb);
    for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame)
    {
        if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue;
        f.values.clear();
        if (m_outputUnvoiced == 1)
        {
            // std::fabs keeps the value in floating point; an unqualified abs() may
            // resolve to the C int overload and truncate the pitch.
            f.values.push_back(std::fabs(mpOut[iFrame]));
        } else {
            f.values.push_back(mpOut[iFrame]);
        }
        fs[m_oSmoothedPitchTrack].push_back(f);
    }
    return fs;
}
// Number of frames currently held in the candidate history.
int ESPYIN::getFrames()
{
    const size_t stored = m_pitchProb.size();
    return static_cast<int>(stored);
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:33 (1 d, 15 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347192
Default Alt Text
(63 KB)
Attached To
R350 av_svc
Event Timeline
Log In to Comment