Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F4880358
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
42 KB
Subscribers
None
View Options
diff --git a/AIMeiSheng/._readme_meisheng.md b/AIMeiSheng/._readme_meisheng.md
deleted file mode 100644
index 50212ca..0000000
Binary files a/AIMeiSheng/._readme_meisheng.md and /dev/null differ
diff --git a/AIMeiSheng/docker_demo/common.py b/AIMeiSheng/docker_demo/common.py
index 098cba8..64aba31 100644
--- a/AIMeiSheng/docker_demo/common.py
+++ b/AIMeiSheng/docker_demo/common.py
@@ -1,121 +1,122 @@
import os
import sys
import time
# import logging
import urllib, urllib.request
# 测试/正式环境
gs_prod = True
# if len(sys.argv) > 1 and sys.argv[1] == "prod":
# gs_prod = True
# print(gs_prod)
gs_tmp_dir = "/tmp/ai_meisheng_tmp"
gs_model_dir = "/tmp/ai_meisheng_models"
gs_resource_cache_dir = "/tmp/ai_meisheng_resource_cache"
gs_embed_model_path = os.path.join(gs_model_dir, "RawNet3/models/weights/model.pt")
gs_svc_model_path = os.path.join(gs_model_dir,
"weights/xusong_v2_org_version_alldata_embed_spkenx200x_double_e14_s90706.pth")
gs_hubert_model_path = os.path.join(gs_model_dir, "hubert.pt")
gs_rmvpe_model_path = os.path.join(gs_model_dir, "rmvpe.pt")
gs_embed_model_spk_path = os.path.join(gs_model_dir, "SpeakerEncoder/pretrained_model/best_model.pth.tar")
gs_embed_config_spk_path = os.path.join(gs_model_dir, "SpeakerEncoder/pretrained_model/config.json")
# errcode
gs_err_code_success = 0
gs_err_code_download_vocal = 100
gs_err_code_download_svc_url = 101
gs_err_code_svc_process = 102
gs_err_code_transcode = 103
gs_err_code_volume_adjust = 104
gs_err_code_upload = 105
gs_err_code_params = 106
gs_err_code_pending = 107
gs_err_code_target_silence = 108
gs_err_code_too_many_connections = 429
+gs_err_code_gender_classify = 430
gs_redis_conf = {
"host": "av-credis.starmaker.co",
"port": 6379,
"pwd": "lKoWEhz%jxTO",
}
gs_server_redis_conf = {
"producer": "test_ai_meisheng_producer", # 输入的队列
"ai_meisheng_key_prefix": "test_ai_meisheng_key_", # 存储结果情况
}
if gs_prod:
gs_server_redis_conf = {
"producer": "ai_meisheng_producer", # 输入的队列
"ai_meisheng_key_prefix": "ai_meisheng_key_", # 存储结果情况
}
gs_feishu_conf = {
"url": "http://sg-prod-songbook-webmp-1:8000/api/feishu/people",
"users": [
"18810833785", # 杨建利
"17778007843", # 王健军
"18612496315" # 郭子豪
]
}
def download2disk(url, dst_path):
try:
urllib.request.urlretrieve(url, dst_path)
return os.path.exists(dst_path)
except Exception as ex:
print(f"download url={url} error", ex)
return False
def exec_cmd(cmd):
# gs_logger.info(cmd)
print(cmd)
ret = os.system(cmd)
if ret != 0:
return False
return True
def exec_cmd_and_result(cmd):
r = os.popen(cmd)
text = r.read()
r.close()
return text
def upload_file2cos(key, file_path, region='ap-singapore', bucket_name='av-audit-sync-sg-1256122840'):
"""
将文件上传到cos
:param key: 桶上的具体地址
:param file_path: 本地文件地址
:param region: 区域
:param bucket_name: 桶地址
:return:
"""
gs_coscmd = "coscmd"
gs_coscmd_conf = "~/.cos.conf"
cmd = "{} -c {} -r {} -b {} upload {} {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, file_path, key)
if exec_cmd(cmd):
cmd = "{} -c {} -r {} -b {} info {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, key) \
+ "| grep Content-Length |awk \'{print $2}\'"
res_str = exec_cmd_and_result(cmd)
# logging.info("{},res={}".format(key, res_str))
size = float(res_str)
if size > 0:
return True
return False
return False
def check_input(input_data):
key_list = ["record_song_url", "target_url", "start", "end", "vocal_loudness", "female_recording_url",
"male_recording_url"]
for key in key_list:
if key not in input_data.keys():
return False
return True
diff --git a/AIMeiSheng/docker_demo/svc_online.py b/AIMeiSheng/docker_demo/svc_online.py
index a52ab24..f12143f 100644
--- a/AIMeiSheng/docker_demo/svc_online.py
+++ b/AIMeiSheng/docker_demo/svc_online.py
@@ -1,190 +1,194 @@
# -*- coding: UTF-8 -*-
"""
SVC的核心处理逻辑
"""
import os
import time
import socket
import shutil
import hashlib
from AIMeiSheng.meisheng_svc_final import load_model, process_svc_online
from AIMeiSheng.cos_similar_ui_zoom import cos_similar
from AIMeiSheng.meisheng_env_preparex import meisheng_env_prepare
from AIMeiSheng.voice_classification.online.voice_class_online_fang import VoiceClass, download_volume_balanced
from AIMeiSheng.docker_demo.common import *
import logging
hostname = socket.gethostname()
log_file_name = f"{os.path.dirname(os.path.abspath(__file__))}/av_meisheng_{hostname}.log"
# 设置logger
svc_offline_logger = logging.getLogger("svc_offline")
file_handler = logging.FileHandler(log_file_name)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S')
file_handler.setFormatter(formatter)
if gs_prod:
svc_offline_logger.addHandler(file_handler)
if os.path.exists(gs_tmp_dir):
shutil.rmtree(gs_tmp_dir)
os.makedirs(gs_model_dir, exist_ok=True)
os.makedirs(gs_resource_cache_dir, exist_ok=True)
# 预设参数
gs_gender_models_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/hub/voice_classification/models.zip"
gs_volume_bin_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/ebur128_tool"
class GSWorkerAttr:
def __init__(self, input_data):
# 取出输入资源
vocal_url = input_data["record_song_url"]
target_url = input_data["target_url"]
start = input_data["start"] # 单位是ms
end = input_data["end"] # 单位是ms
vocal_loudness = input_data["vocal_loudness"]
female_recording_url = input_data["female_recording_url"]
male_recording_url = input_data["male_recording_url"]
self.distinct_id = hashlib.md5(vocal_url.encode()).hexdigest()
self.tmp_dir = os.path.join(gs_tmp_dir, self.distinct_id)
if os.path.exists(self.tmp_dir):
shutil.rmtree(self.tmp_dir)
os.makedirs(self.tmp_dir)
self.vocal_url = vocal_url
self.target_url = target_url
ext = vocal_url.split(".")[-1]
self.vocal_path = os.path.join(self.tmp_dir, self.distinct_id + f"_in.{ext}")
self.target_wav_path = os.path.join(self.tmp_dir, self.distinct_id + "_out.wav")
self.target_wav_ad_path = os.path.join(self.tmp_dir, self.distinct_id + "_out_ad.wav")
self.target_path = os.path.join(self.tmp_dir, self.distinct_id + "_out.m4a")
self.female_svc_source_url = female_recording_url
self.male_svc_source_url = male_recording_url
ext = female_recording_url.split(".")[-1]
self.female_svc_source_path = os.path.join(gs_resource_cache_dir,
hashlib.md5(female_recording_url.encode()).hexdigest() + "." + ext)
ext = male_recording_url.split(".")[-1]
self.male_svc_source_path = os.path.join(gs_resource_cache_dir,
hashlib.md5(male_recording_url.encode()).hexdigest() + "." + ext)
self.st_tm = start
self.ed_tm = end
self.target_loudness = vocal_loudness
def log_info_name(self):
return f"d_id={self.distinct_id}, vocal_url={self.vocal_url}"
def rm_cache(self):
if os.path.exists(self.tmp_dir):
shutil.rmtree(self.tmp_dir)
def init_gender_model():
"""
下载模型
:return:
"""
dst_model_dir = os.path.join(gs_model_dir, "voice_classification")
if not os.path.exists(dst_model_dir):
dst_zip_path = os.path.join(gs_model_dir, "models.zip")
if not download2disk(gs_gender_models_url, dst_zip_path):
svc_offline_logger.fatal(f"download gender_model err={gs_gender_models_url}")
cmd = f"cd {gs_model_dir}; unzip {dst_zip_path}; mv models voice_classification; rm -f {dst_zip_path}"
os.system(cmd)
if not os.path.exists(dst_model_dir):
svc_offline_logger.fatal(f"unzip {dst_zip_path} err")
music_voice_pure_model = os.path.join(dst_model_dir, "voice_005_rec_v5.pth")
music_voice_no_pure_model = os.path.join(dst_model_dir, "voice_10_v5.pth")
gender_pure_model = os.path.join(dst_model_dir, "gender_8k_ratev5_v6_adam.pth")
gender_no_pure_model = os.path.join(dst_model_dir, "gender_8k_v6_adam.pth")
vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
return vc
def init_svc_model():
meisheng_env_prepare(logging, gs_model_dir)
embed_model, hubert_model = load_model()
cs_sim = cos_similar()
return embed_model, hubert_model,cs_sim
def download_volume_adjustment():
"""
下载音量调整工具
:return:
"""
volume_bin_path = os.path.join(gs_model_dir, "ebur128_tool")
if not os.path.exists(volume_bin_path):
if not download2disk(gs_volume_bin_url, volume_bin_path):
svc_offline_logger.fatal(f"download volume_bin err={gs_volume_bin_url}")
os.system(f"chmod +x {volume_bin_path}")
def volume_adjustment(wav_path, target_loudness, out_path):
"""
音量调整
:param wav_path:
:param target_loudness:
:param out_path:
:return:
"""
volume_bin_path = os.path.join(gs_model_dir, "ebur128_tool")
cmd = f"{volume_bin_path} {wav_path} {target_loudness} {out_path}"
os.system(cmd)
class SVCOnline:
def __init__(self):
st = time.time()
self.gender_model = init_gender_model()
self.embed_model, self.hubert_model, self.cs_sim = init_svc_model()
download_volume_adjustment()
download_volume_balanced()
svc_offline_logger.info(f"svc init finished, sp = {time.time() - st}")
def gender_process(self, worker_attr):
st = time.time()
gender, female_rate, is_pure = self.gender_model.process(worker_attr.vocal_path)
svc_offline_logger.info(
f"{worker_attr.vocal_url}, gender={gender}, female_rate={female_rate}, is_pure={is_pure}, "
f"gender_process sp = {time.time() - st}")
if gender == 0:
gender = 'female'
elif gender == 1:
gender = 'male'
+ elif female_rate == None:
+ gender = 'male'
+ return gender, gs_err_code_gender_classify
elif female_rate > 0.5:
gender = 'female'
else:
gender = 'male'
+
svc_offline_logger.info(f"{worker_attr.vocal_url}, modified gender={gender}")
# err = gs_err_code_success
# if female_rate == -1:
# err = gs_err_code_target_silence
return gender, gs_err_code_success
def process(self, worker_attr):
gender, err = self.gender_process(worker_attr)
if err != gs_err_code_success:
return gender, err
song_path = worker_attr.female_svc_source_path
if gender == "male":
song_path = worker_attr.male_svc_source_path
params = {'gender': gender, 'tst': worker_attr.st_tm, "tnd": worker_attr.ed_tm, 'delay': 0, 'song_path': None}
st = time.time()
err_code = process_svc_online(song_path, worker_attr.vocal_path, worker_attr.target_wav_path, self.embed_model,
self.hubert_model, self.cs_sim, params)
svc_offline_logger.info(f"{worker_attr.vocal_url}, err_code={err_code} process svc sp = {time.time() - st}")
return gender, err_code
diff --git a/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py b/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py
index 076184f..f1e8f48 100644
--- a/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py
+++ b/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py
@@ -1,778 +1,781 @@
import numpy as np, parselmouth, torch, pdb, sys, os
from time import time as ttime
import torch.nn.functional as F
import scipy.signal as signal
import pyworld, os, traceback, faiss, librosa, torchcrepe
from scipy import signal
from functools import lru_cache
now_dir = os.getcwd()
sys.path.append(now_dir)
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
input_audio_path2wav = {}
fidx = 0
import threading
import concurrent.futures
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
audio = input_audio_path2wav[input_audio_path]
f0, t = pyworld.harvest(
audio,
fs=fs,
f0_ceil=f0max,
f0_floor=f0min,
frame_period=frame_period,
)
f0 = pyworld.stonemask(audio, f0, t, fs)
return f0
def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比
# print(data1.max(),data2.max())
rms1 = librosa.feature.rms(
y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
) # 每半秒一个点
rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
rms1 = torch.from_numpy(rms1)
rms1 = F.interpolate(
rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
).squeeze()
rms2 = torch.from_numpy(rms2)
rms2 = F.interpolate(
rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
).squeeze()
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
data2 *= (
torch.pow(rms1, torch.tensor(1 - rate))
* torch.pow(rms2, torch.tensor(rate - 1))
).numpy()
return data2
class VC(object):
def __init__(self, tgt_sr, config):
self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
config.x_pad, ##config会根据设备配置不通知如:3
config.x_query, # 10 等于x_max-x_center)*2
config.x_center, #60
config.x_max, #65
config.is_half,
)
self.sr = 16000 # hubert输入采样率
self.window = 160 # 每帧点数
self.t_pad = self.sr * self.x_pad # 每条前后pad时间
self.t_pad_tgt = tgt_sr * self.x_pad
self.t_pad2 = self.t_pad * 2
self.t_query = self.sr * self.x_query # 查询切点前后查询时间,
self.t_center = self.sr * self.x_center # 查询切点位置
self.t_max = self.sr * self.x_max # 免查询时长阈值
self.device = config.device
def get_f0(
self,
input_audio_path,
x,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0=None,
):
global input_audio_path2wav
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm":
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
input_audio_path2wav[input_audio_path] = x.astype(np.double)
f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
if filter_radius > 2:
f0 = signal.medfilt(f0, 3)
elif f0_method == "crepe":
model = "full"
# Pick a batch size that doesn't cause memory errors on your gpu
batch_size = 512
# Compute pitch using first gpu
audio = torch.tensor(np.copy(x))[None].float()
f0, pd = torchcrepe.predict(
audio,
self.sr,
self.window,
f0_min,
f0_max,
model,
batch_size=batch_size,
device=self.device,
return_periodicity=True,
)
pd = torchcrepe.filter.median(pd, 3)
f0 = torchcrepe.filter.mean(f0, 3)
f0[pd < 0.1] = 0
f0 = f0[0].cpu().numpy()
elif f0_method == "rmvpe":
if hasattr(self, "model_rmvpe") == False:
from lib.rmvpe import RMVPE
print("loading rmvpe model")
self.model_rmvpe = RMVPE(
"rmvpe.pt", is_half=self.is_half, device=self.device
)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
else: ##for meisheng
self.model_rmvpe = f0_method
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
##这里读文件,更改pitch st fang
valid_f0 = f0[f0 > 50]
mean_pitch_cur = np.mean(valid_f0[:min(len(valid_f0),500)])
-
#print("@@f0_up_key:",f0_up_key)
deta = 0
if(f0_up_key > 50 ):
deta = -mean_pitch_cur + f0_up_key
#print("$$$$$$$$$fangxxxxx pitch shift: ",deta)
- f0_up_key = int(np.log2(deta/(mean_pitch_cur + 1) + 1) * 12)##方法2 fang
- if( abs(f0_up_key) <= 8 ):
+ f0_up_key = np.log2(deta/(mean_pitch_cur + 1) + 1) * 12
+ if np.isnan(f0_up_key):
f0_up_key = 0
- elif f0_up_key > 8:
+ f0_up_key = int(f0_up_key)
+ #f0_up_key = int(np.log2(deta/(mean_pitch_cur + 1) + 1) * 12)##方法2 fang
+ if( f0_up_key >= 12 ):
f0_up_key = 12
- elif f0_up_key < -8:
+ elif f0_up_key < -12:
f0_up_key = -12
+ else:
+ f0_up_key = 0
#if( abs(f0_up_key) < 3 ):
# f0_up_key = 0
- f0_up_key = max(min(12,f0_up_key),-12)
+ # f0_up_key = max(min(12,f0_up_key),-12)
#print("f0_up_key: ",f0_up_key)
f0 *= pow(2, f0_up_key / 12)#这块是音调更改 fang 我设置的0
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0 = self.sr // self.window # 每秒f0点数
if inp_f0 is not None:
delta_t = np.round(
(inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
).astype("int16")
replace_f0 = np.interp(
list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
)
shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
:shape
]
# with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)
return f0_coarse, f0bak # 1-0
def vc(
self,
model,
net_g,
sid,
audio0,
pitch,
pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0)
if self.is_half:
feats = feats.half()
else:
feats = feats.float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
#print("@@@feats: ",feats.shape)
#print("@@@padding_mask: ",padding_mask.shape)
inputs = {
"source": feats.to(self.device),
"padding_mask": padding_mask,
"output_layer": 9 if version == "v1" else 12,
#"output_layer": 6 if version == "v1" else 12,
}
t0 = ttime()
#'''
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0]) if version == "v1" else logits[0]#为何v1要转化,维度问题??? fang
#'''
#print("@@@feats: ",feats.shape)
'''
global fidx
feats_name = f"./feats_{fidx}.pt"
fidx += 1
torch.save(feats, feats_name)
feats = torch.load(feats_name)
#'''
if protect < 0.5 and pitch != None and pitchf != None:
feats0 = feats.clone()
if (
isinstance(index, type(None)) == False
and isinstance(big_npy, type(None)) == False
and index_rate != 0
):
npy = feats[0].cpu().numpy()
if self.is_half:
npy = npy.astype("float32")
# _, I = index.search(npy, 1)
# npy = big_npy[I.squeeze()]
score, ix = index.search(npy, k=8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
if self.is_half:
npy = npy.astype("float16")
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
)##基于index和实际音频的特征进行组合,作为输入 fang
#print("@@@feats: ",feats.shape)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if protect < 0.5 and pitch != None and pitchf != None:
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
0, 2, 1
)#feats0的维度1 插值增加一倍 fang
t1 = ttime()
p_len = audio0.shape[0] // self.window ##分帧求pitch fang
if feats.shape[1] < p_len:
p_len = feats.shape[1]
if pitch != None and pitchf != None:
pitch = pitch[:, :p_len]
pitchf = pitchf[:, :p_len]
if protect < 0.5 and pitch != None and pitchf != None:
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1
pitchff[pitchf < 1] = protect
pitchff = pitchff.unsqueeze(-1)
feats = feats * pitchff + feats0 * (1 - pitchff)
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
#print("###feats:",feats.shape,"pitch:",pitch.shape,"p_len:",p_len)
with torch.no_grad():
if pitch != None and pitchf != None:
audio1 = (
(net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
.data.cpu()
.float()
.numpy()
)
else:
audio1 = (
(net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
)
del feats, p_len, padding_mask
if torch.cuda.is_available():
torch.cuda.empty_cache()
t2 = ttime()
times[0] += t1 - t0
times[2] += t2 - t1
return audio1
def pipeline(
self,
model,
net_g,
sid,
audio,## input wav
input_audio_path, #input wav name
times,
f0_up_key,
f0_method,# f0 meathod
file_index, #index 路径
# file_big_npy,
index_rate,
if_f0,
filter_radius,
tgt_sr,
resample_sr,
rms_mix_rate,
version,
protect,
f0_file=None,
):
if (
file_index != "" #.index文件不为空 fang
# and file_big_npy != ""
# and os.path.exists(file_big_npy) == True
and os.path.exists(file_index) == True
and index_rate != 0
):
try:
index = faiss.read_index(file_index)
# big_npy = np.load(file_big_npy)
big_npy = index.reconstruct_n(0, index.ntotal)
except:
traceback.print_exc()
index = big_npy = None
else:
index = big_npy = None
#print("####audio 1:",audio.shape)
audio = signal.filtfilt(bh, ah, audio)
#print("####audio 2:",audio.shape)
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
opt_ts = []
#print("###t_max:",self.t_max)
#print("###window:",self.window,"self.t_query:",self.t_query,"self.t_pad2:",self.t_pad2)
if audio_pad.shape[0] > self.t_max:
audio_sum = np.zeros_like(audio)
for i in range(self.window):
audio_sum += audio_pad[i : i - self.window]#这样算循环了,每个idx是过去一帧的值的和 fang
for t in range(self.t_center, audio.shape[0], self.t_center):#一分钟一帧?? fang
opt_ts.append(
t
- self.t_query
+ np.where(
np.abs(audio_sum[t - self.t_query : t + self.t_query])
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
)[0][0]
)#返回[ t - self.t_query, t+self.t_query] 区间最小值位置的索引保存,fang
s = 0
audio_opt = []
t = None
t1 = ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
p_len = audio_pad.shape[0] // self.window
inp_f0 = None
if hasattr(f0_file, "name") == True:
try:
with open(f0_file.name, "r") as f:
lines = f.read().strip("\n").split("\n")
inp_f0 = []
for line in lines:
inp_f0.append([float(i) for i in line.split(",")])
inp_f0 = np.array(inp_f0, dtype="float32")
except:
traceback.print_exc()
#sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
sid_embed = np.load(sid)
sid = torch.FloatTensor(sid_embed).to(self.device).half()
pitch, pitchf = None, None
if if_f0 == 1:
pitch, pitchf = self.get_f0(
input_audio_path,
audio_pad,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0,
)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
if self.device == "mps":
pitchf = pitchf.astype(np.float32)
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
#print("&&&&pitch: ",pitchf)
t2 = ttime()
times[1] += t2 - t1
#print("####len(audio_pad):",len(audio_pad))
#print("###pitch:", pitch.shape)
for t in opt_ts: #分段推理每段音频,一段这里设置60s左右 fang
t = t // self.window * self.window
if if_f0 == 1:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
s = t
if if_f0 == 1: ##后面是最后一段处理 fang
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
pitch[:, t // self.window :] if t is not None else pitch,
pitchf[:, t // self.window :] if t is not None else pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
audio_opt = np.concatenate(audio_opt)
if rms_mix_rate != 1:
audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
if resample_sr >= 16000 and tgt_sr != resample_sr:
audio_opt = librosa.resample(
audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
)
audio_max = np.abs(audio_opt).max() / 0.99
max_int16 = 32768
if audio_max > 1:
max_int16 /= audio_max
audio_opt = (audio_opt * max_int16).astype(np.int16)
del pitch, pitchf, sid
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio_opt
def infer_core_fang(self,para1,para2,para3,idx,
model,
net_g,
sid,
times,
index,
big_npy,
index_rate,
version,
protect):
return [ self.vc(
model,
net_g,
sid,
para1, para2, para3,
# audio_pad[s: t + self.t_pad2 + self.window],
# pitch[:, s // self.window: (t + self.t_pad2) // self.window],
# pitchf[:, s // self.window: (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt: -self.t_pad_tgt], idx]
def ThreadPool_process_core(self, func_process,params1,params2,params3,
model,
net_g,
sid,
# audio_pad[s: t + self.t_pad2 + self.window],
# pitch[:, s // self.window: (t + self.t_pad2) // self.window],
# pitchf[:, s // self.window: (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
version,
protect
):
num_threads = 2
futures = []
sort_ret = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
for idx in range(len(params1)):
para1 = params1[idx]
para2 = params2[idx]
para3 = params3[idx]
ret = executor.submit(self.infer_core_fang,para1,para2,para3,idx,
model,
net_g,
sid,
times,
index,
big_npy,
index_rate,
version,
protect)
futures.append(ret)
cnt = 0
for future in concurrent.futures.as_completed(futures):
cnt += 1
#print(f"process finised {cnt}, and index :{future.result()[1]}")
#print(future.result()) # result
# print(future.result()[1]) ##index
sort_ret[str(future.result()[1])] = future.result()[0]
fea_list = []
for idx in range(len(sort_ret)):
fea_list.append(sort_ret[str(idx)])
return fea_list
def pipeline_mulprocess(
self,
model,
net_g,
sid,
audio, ## input wav
input_audio_path, # input wav name
times,
f0_up_key,
f0_method, # f0 meathod
file_index, # index 路径
# file_big_npy,
index_rate,
if_f0,
filter_radius,
tgt_sr,
resample_sr,
rms_mix_rate,
version,
protect,
f0_file=None,
):
if (
file_index != "" # .index文件不为空 fang
# and file_big_npy != ""
# and os.path.exists(file_big_npy) == True
and os.path.exists(file_index) == True
and index_rate != 0
):
try:
index = faiss.read_index(file_index)
# big_npy = np.load(file_big_npy)
big_npy = index.reconstruct_n(0, index.ntotal)
except:
traceback.print_exc()
index = big_npy = None
else:
index = big_npy = None
audio = signal.filtfilt(bh, ah, audio)
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
opt_ts = []
if audio_pad.shape[0] > self.t_max:
audio_sum = np.zeros_like(audio)
for i in range(self.window):
audio_sum += audio_pad[i: i - self.window] # 这样算循环了,每个idx是过去一帧的值的和 fang
for t in range(self.t_center, audio.shape[0], self.t_center): # 一分钟一帧?? fang
opt_ts.append(
t
- self.t_query
+ np.where(
np.abs(audio_sum[t - self.t_query: t + self.t_query])
== np.abs(audio_sum[t - self.t_query: t + self.t_query]).min()
)[0][0]
) # 返回[ t - self.t_query, t+self.t_query] 区间最小值位置的索引保存,fang
s = 0
t = None
t1 = ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
p_len = audio_pad.shape[0] // self.window
inp_f0 = None
if hasattr(f0_file, "name") == True:
try:
with open(f0_file.name, "r") as f:
lines = f.read().strip("\n").split("\n")
inp_f0 = []
for line in lines:
inp_f0.append([float(i) for i in line.split(",")])
inp_f0 = np.array(inp_f0, dtype="float32")
except:
traceback.print_exc()
# sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
sid_embed = np.load(sid)
embed_npy_spk = sid[:-4] + '_spk.npy'
sid_spk_embed = np.load(embed_npy_spk )
print("555555sid_embed:",np.shape(sid_embed),'type:',type(sid_embed))
print('sid_spk_embed:', np.shape(sid_spk_embed), 'type:',type(sid_spk_embed))
sid_embed = np.concatenate((sid_embed, sid_spk_embed),axis=0)
print('sid_embed:', np.shape(sid_embed), 'type:',type(sid_embed))
sid = torch.FloatTensor(sid_embed).to(self.device).half()
#sid_embed = np.load(sid)
#sid = torch.FloatTensor(sid_embed).to(self.device).half()
print('sid:',sid.shape)
pitch, pitchf = None, None
#'''
if if_f0 == 1:
pitch, pitchf = self.get_f0(
input_audio_path,
audio_pad,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0,
)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
if self.device == "mps":
pitchf = pitchf.astype(np.float32)
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
#'''
'''
pitch_name = "./pitch_pitchf.npz"
#np.savez(pitch_name, pitch = pitch.detach().cpu().numpy(), pitchf = pitchf.detach().cpu().numpy())
npz_obj = np.load(pitch_name) #文件名的后缀为npz
pitch, pitchf = npz_obj['pitch'], npz_obj['pitchf']
pitch = torch.tensor(pitch, device=self.device).long()
pitchf = torch.tensor(pitchf, device=self.device).float()
#'''
t2 = ttime()
times[1] += t2 - t1
audio_opt = []
audio_pad_list = []
pitch_list = []
pitchf_list = []
for t in opt_ts: # 分段推理每段音频,一段这里设置60s左右 fang
t = t // self.window * self.window
audio_pad_list.append(audio_pad[s: t + self.t_pad2 + self.window])
pitch_list.append(pitch[:, s // self.window: (t + self.t_pad2) // self.window])
pitchf_list.append(pitchf[:, s // self.window: (t + self.t_pad2) // self.window])
s = t
audio_pad_list.append(audio_pad[t:])
pitch_list.append(pitch[:, t // self.window:] if t is not None else pitch)
pitchf_list.append(pitchf[:, t // self.window:] if t is not None else pitchf)
audio_opt = self.ThreadPool_process_core(self.infer_core_fang, audio_pad_list, pitch_list, pitchf_list,
model,
net_g,
sid,
times,
index,
big_npy,
index_rate,
version,
protect
)
'''
if if_f0 == 1: ##后面是最后一段处理 fang
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
pitch[:, t // self.window:] if t is not None else pitch,
pitchf[:, t // self.window:] if t is not None else pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt: -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt: -self.t_pad_tgt]
)
#'''
audio_opt = np.concatenate(audio_opt)
if rms_mix_rate != 1:
audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
if resample_sr >= 16000 and tgt_sr != resample_sr:
audio_opt = librosa.resample(
audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
)
audio_max = np.abs(audio_opt).max() / 0.99
max_int16 = 32768
if audio_max > 1:
max_int16 /= audio_max
audio_opt = (audio_opt * max_int16).astype(np.int16)
del pitch, pitchf, sid
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio_opt
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:35 (1 d, 15 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347136
Default Alt Text
(42 KB)
Attached To
R350 av_svc
Event Timeline
Log In to Comment