diff --git a/AIMeiSheng/vc_infer_pipeline_org_embed.py b/AIMeiSheng/vc_infer_pipeline_org_embed.py
index d53f4ae..cfa0675 100644
--- a/AIMeiSheng/vc_infer_pipeline_org_embed.py
+++ b/AIMeiSheng/vc_infer_pipeline_org_embed.py
@@ -1,760 +1,766 @@
import numpy as np, parselmouth, torch, sys, os
from time import time as ttime
import torch.nn.functional as F
import pyworld, traceback, faiss, librosa, torchcrepe
from scipy import signal
from functools import lru_cache
now_dir = os.getcwd()
sys.path.append(now_dir)
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
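# 5th-order Butterworth high-pass at 48 Hz (fs=16 kHz): strips DC offset and
# sub-bass rumble from the input before f0 analysis and slicing.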
input_audio_path2wav = {}
fidx = 0
import concurrent.futures
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
audio = input_audio_path2wav[input_audio_path]
f0, t = pyworld.harvest(
audio,
fs=fs,
f0_ceil=f0max,
f0_floor=f0min,
frame_period=frame_period,
)
f0 = pyworld.stonemask(audio, f0, t, fs)
return f0
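# Minimal usage sketch for the cached extractor above (path and values are
# illustrative only): lru_cache can only hash the arguments, so the raw
# waveform is handed over through the module-level dict first:
#   input_audio_path2wav["/tmp/in.wav"] = wav.astype(np.double)
#   f0 = cache_harvest_f0("/tmp/in.wav", 16000, 1100, 50, 10)
# The cache keys on the path, so registering a different waveform under the
# same path returns stale f0 until the cache is cleared.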
def change_rms(data1, sr1, data2, sr2, rate):  # data1 is the input audio, data2 the output audio; rate is data2's share of the mix
    # print(data1.max(), data2.max())
    rms1 = librosa.feature.rms(
        y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
    )  # one RMS point every half second
    rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
rms1 = torch.from_numpy(rms1)
rms1 = F.interpolate(
rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
).squeeze()
rms2 = torch.from_numpy(rms2)
rms2 = F.interpolate(
rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
).squeeze()
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
data2 *= (
torch.pow(rms1, torch.tensor(1 - rate))
* torch.pow(rms2, torch.tensor(rate - 1))
).numpy()
return data2
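# The mix above computes data2 * rms1^(1 - rate) * rms2^(rate - 1): at rate=1
# the output envelope is left untouched, at rate=0 it is fully replaced by the
# input's envelope. A hedged call sketch (names are illustrative only):
#   out = change_rms(dry_wav, 16000, converted_wav, 32000, rate=0.25)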
class VC(object):
def __init__(self, tgt_sr, config):
        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
            config.x_pad,  # from config; varies with the device setup, e.g. 3
            config.x_query,  # 10, equal to (x_max - x_center) * 2
            config.x_center,  # 60
            config.x_max,  # 65
            config.is_half,
        )
        self.sr = 16000  # HuBERT input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding added before and after each clip
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search range around each candidate cut point
        self.t_center = self.sr * self.x_center  # spacing between candidate cut points
        self.t_max = self.sr * self.x_max  # below this length, skip the cut-point search
        self.device = config.device
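        # With the example config values above (x_pad=3, x_query=10, x_center=60,
        # x_max=65) and sr=16000: t_pad=48000 samples (3 s), t_query=160000 (10 s),
        # t_center=960000 (60 s), t_max=1040000 (65 s), so clips longer than ~65 s
        # get split at low-energy points.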
def get_f0(
self,
input_audio_path,
x,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0=None,
):
global input_audio_path2wav
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm":
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
input_audio_path2wav[input_audio_path] = x.astype(np.double)
f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
if filter_radius > 2:
f0 = signal.medfilt(f0, 3)
elif f0_method == "crepe":
model = "full"
# Pick a batch size that doesn't cause memory errors on your gpu
batch_size = 512
# Compute pitch using first gpu
audio = torch.tensor(np.copy(x))[None].float()
f0, pd = torchcrepe.predict(
audio,
self.sr,
self.window,
f0_min,
f0_max,
model,
batch_size=batch_size,
device=self.device,
return_periodicity=True,
)
pd = torchcrepe.filter.median(pd, 3)
f0 = torchcrepe.filter.mean(f0, 3)
f0[pd < 0.1] = 0
f0 = f0[0].cpu().numpy()
        elif f0_method == "rmvpe":
            if not hasattr(self, "model_rmvpe"):
                from lib.rmvpe import RMVPE

                print("loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "rmvpe.pt", is_half=self.is_half, device=self.device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        else:  # for meisheng: f0_method is already a loaded RMVPE-style model
            self.model_rmvpe = f0_method
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        # the pitch is read and adjusted here (in semitones) fang
        valid_f0 = f0[f0 > 50]
        mean_pitch_cur = np.mean(valid_f0[: min(len(valid_f0), 500)])
        # print("@@f0_up_key:", f0_up_key)
        deta = 0
        if f0_up_key > 50:
            deta = f0_up_key - mean_pitch_cur
        # print("$$$$$$$$$fangxxxxx pitch shift: ", deta)
        f0_up_key = int(np.log2(deta / (mean_pitch_cur + 1) + 1) * 12)  # method 2 fang
+        if abs(f0_up_key) <= 8:
+            f0_up_key = 0
+        elif f0_up_key > 8:
+            f0_up_key = 12
+        elif f0_up_key < -8:
+            f0_up_key = -12
        # if abs(f0_up_key) < 3:
        #     f0_up_key = 0
        f0_up_key = max(min(12, f0_up_key), -12)
        # print("f0_up_key: ", f0_up_key)
        f0 *= pow(2, f0_up_key / 12)  # the pitch shift is applied here; fang: I set it to 0
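        # Worked example of the shift above: with mean_pitch_cur = 220 Hz and a
        # target f0_up_key of 440 Hz, deta = 220 and log2(220/221 + 1) * 12 ~= 11.96,
        # truncated by int() to 11 semitones; the added +/-8 dead-zone logic then
        # snaps it to a full octave (12), so f0 is exactly doubled. Shifts within
        # 8 semitones are zeroed out entirely.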
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # number of f0 points per second
if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int32")  # int32: int16 would overflow past ~327 s of f0 frames
replace_f0 = np.interp(
list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
)
shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
:shape
]
# with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)
return f0_coarse, f0bak # 1-0
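    # Quantization check for the mel mapping above: f0 = 440 Hz gives
    # f0_mel = 1127 * ln(1 + 440/700) ~= 549.6; with f0_mel_min ~= 77.8 and
    # f0_mel_max ~= 1064.4, the coarse bin is
    # rint((549.6 - 77.8) * 254 / 986.6 + 1) = 122 of the 255 available bins.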
def vc(
self,
model,
net_g,
sid,
audio0,
pitch,
pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0)
if self.is_half:
feats = feats.half()
else:
feats = feats.float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
#print("@@@feats: ",feats.shape)
#print("@@@padding_mask: ",padding_mask.shape)
inputs = {
"source": feats.to(self.device),
"padding_mask": padding_mask,
"output_layer": 9 if version == "v1" else 12,
#"output_layer": 6 if version == "v1" else 12,
}
t0 = ttime()
#'''
with torch.no_grad():
logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]  # why does v1 need the projection? a dimensionality issue? fang
#'''
#print("@@@feats: ",feats.shape)
'''
global fidx
feats_name = f"./feats_{fidx}.pt"
fidx += 1
torch.save(feats, feats_name)
feats = torch.load(feats_name)
#'''
        if protect < 0.5 and pitch is not None and pitchf is not None:
feats0 = feats.clone()
        if (
            index is not None
            and big_npy is not None
            and index_rate != 0
        ):
npy = feats[0].cpu().numpy()
if self.is_half:
npy = npy.astype("float32")
# _, I = index.search(npy, 1)
# npy = big_npy[I.squeeze()]
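            # retrieval smoothing: take the k=8 nearest index entries per frame,
            # weight them by the inverse square of the faiss distance (normalized
            # per frame), and use the weighted sum of the retrieved vectors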
score, ix = index.search(npy, k=8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
if self.is_half:
npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )  # blend the index-retrieved features with the live features as the model input fang
#print("@@@feats: ",feats.shape)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if protect < 0.5 and pitch is not None and pitchf is not None:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )  # interpolate feats0 along dim 1 to double its length fang
t1 = ttime()
        p_len = audio0.shape[0] // self.window  # number of pitch frames fang
if feats.shape[1] < p_len:
p_len = feats.shape[1]
        if pitch is not None and pitchf is not None:
pitch = pitch[:, :p_len]
pitchf = pitchf[:, :p_len]
        if protect < 0.5 and pitch is not None and pitchf is not None:
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1
pitchff[pitchf < 1] = protect
pitchff = pitchff.unsqueeze(-1)
feats = feats * pitchff + feats0 * (1 - pitchff)
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
#print("###feats:",feats.shape,"pitch:",pitch.shape,"p_len:",p_len)
with torch.no_grad():
            if pitch is not None and pitchf is not None:
audio1 = (
(net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
.data.cpu()
.float()
.numpy()
)
else:
audio1 = (
(net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
)
del feats, p_len, padding_mask
if torch.cuda.is_available():
torch.cuda.empty_cache()
t2 = ttime()
times[0] += t1 - t0
times[2] += t2 - t1
return audio1
def pipeline(
self,
model,
net_g,
sid,
        audio,  # input wav
        input_audio_path,  # input wav name
        times,
        f0_up_key,
        f0_method,  # f0 method
        file_index,  # path to the .index file
# file_big_npy,
index_rate,
if_f0,
filter_radius,
tgt_sr,
resample_sr,
rms_mix_rate,
version,
protect,
f0_file=None,
):
        if (
            file_index != ""  # the .index path is non-empty fang
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy) == True
            and os.path.exists(file_index)
            and index_rate != 0
        ):
try:
index = faiss.read_index(file_index)
# big_npy = np.load(file_big_npy)
big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception:
traceback.print_exc()
index = big_npy = None
else:
index = big_npy = None
#print("####audio 1:",audio.shape)
audio = signal.filtfilt(bh, ah, audio)
#print("####audio 2:",audio.shape)
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
opt_ts = []
#print("###t_max:",self.t_max)
#print("###window:",self.window,"self.t_query:",self.t_query,"self.t_pad2:",self.t_pad2)
if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]  # rolling sum: each index holds the sum of the past window samples fang
            for t in range(self.t_center, audio.shape[0], self.t_center):  # one candidate cut per t_center (~60 s) fang
opt_ts.append(
t
- self.t_query
+ np.where(
np.abs(audio_sum[t - self.t_query : t + self.t_query])
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
)[0][0]
                )  # keep the index of the minimum of |audio_sum| within [t - t_query, t + t_query] fang
s = 0
audio_opt = []
t = None
t1 = ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
p_len = audio_pad.shape[0] // self.window
inp_f0 = None
        if hasattr(f0_file, "name"):
try:
with open(f0_file.name, "r") as f:
lines = f.read().strip("\n").split("\n")
inp_f0 = []
for line in lines:
inp_f0.append([float(i) for i in line.split(",")])
inp_f0 = np.array(inp_f0, dtype="float32")
            except Exception:
traceback.print_exc()
#sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
sid_embed = np.load(sid)
sid = torch.FloatTensor(sid_embed).to(self.device).half()
pitch, pitchf = None, None
if if_f0 == 1:
pitch, pitchf = self.get_f0(
input_audio_path,
audio_pad,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0,
)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
if self.device == "mps":
pitchf = pitchf.astype(np.float32)
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
#print("&&&&pitch: ",pitchf)
t2 = ttime()
times[1] += t2 - t1
#print("####len(audio_pad):",len(audio_pad))
#print("###pitch:", pitch.shape)
        for t in opt_ts:  # run inference segment by segment; each segment is about 60 s here fang
t = t // self.window * self.window
if if_f0 == 1:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
s = t
        if if_f0 == 1:  # the code below handles the final segment fang
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
pitch[:, t // self.window :] if t is not None else pitch,
pitchf[:, t // self.window :] if t is not None else pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
audio_opt = np.concatenate(audio_opt)
if rms_mix_rate != 1:
audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
if resample_sr >= 16000 and tgt_sr != resample_sr:
audio_opt = librosa.resample(
audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
)
audio_max = np.abs(audio_opt).max() / 0.99
max_int16 = 32768
if audio_max > 1:
max_int16 /= audio_max
audio_opt = (audio_opt * max_int16).astype(np.int16)
del pitch, pitchf, sid
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio_opt
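    # Hedged usage sketch for pipeline() (every name and value below is an
    # illustrative assumption, not part of this file). Note that sid is a path
    # to a speaker-embedding .npy (np.load(sid) above), and f0_up_key here is a
    # target mean pitch in Hz, with values <= 50 meaning "no shift":
    #   vc = VC(tgt_sr=32000, config=cfg)
    #   wav16k = librosa.load("input.wav", sr=16000)[0]
    #   out = vc.pipeline(hubert, net_g, "speaker_embed.npy", wav16k, "input.wav",
    #                     times=[0, 0, 0], f0_up_key=0, f0_method="rmvpe",
    #                     file_index="added.index", index_rate=0.5, if_f0=1,
    #                     filter_radius=3, tgt_sr=32000, resample_sr=0,
    #                     rms_mix_rate=1, version="v2", protect=0.33)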
    def infer_core_fang(self, para1, para2, para3, idx,
                        model,
                        net_g,
                        sid,
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect):
return [ self.vc(
model,
net_g,
sid,
para1, para2, para3,
# audio_pad[s: t + self.t_pad2 + self.window],
# pitch[:, s // self.window: (t + self.t_pad2) // self.window],
# pitchf[:, s // self.window: (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt: -self.t_pad_tgt], idx]
    def ThreadPool_process_core(self, func_process, params1, params2, params3,
model,
net_g,
sid,
# audio_pad[s: t + self.t_pad2 + self.window],
# pitch[:, s // self.window: (t + self.t_pad2) // self.window],
# pitchf[:, s // self.window: (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
version,
protect
):
num_threads = 2
futures = []
sort_ret = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
for idx in range(len(params1)):
para1 = params1[idx]
para2 = params2[idx]
para3 = params3[idx]
                ret = executor.submit(self.infer_core_fang, para1, para2, para3, idx,
model,
net_g,
sid,
times,
index,
big_npy,
index_rate,
version,
protect)
futures.append(ret)
cnt = 0
        for future in concurrent.futures.as_completed(futures):
            cnt += 1
            # print(f"process finished {cnt}, and index: {future.result()[1]}")
            # print(future.result())  # result
            # print(future.result()[1])  # index
            sort_ret[str(future.result()[1])] = future.result()[0]
fea_list = []
for idx in range(len(sort_ret)):
fea_list.append(sort_ret[str(idx)])
return fea_list
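    # as_completed() yields futures out of order, so the loop above keys each
    # result by its submission index in sort_ret and then reassembles fea_list
    # sequentially, keeping the audio segments in chronological order no matter
    # which worker finishes first.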
def pipeline_mulprocess(
self,
model,
net_g,
sid,
        audio,  # input wav
        input_audio_path,  # input wav name
        times,
        f0_up_key,
        f0_method,  # f0 method
        file_index,  # path to the .index file
# file_big_npy,
index_rate,
if_f0,
filter_radius,
tgt_sr,
resample_sr,
rms_mix_rate,
version,
protect,
f0_file=None,
):
        if (
            file_index != ""  # the .index path is non-empty fang
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy) == True
            and os.path.exists(file_index)
            and index_rate != 0
        ):
try:
index = faiss.read_index(file_index)
# big_npy = np.load(file_big_npy)
big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception:
traceback.print_exc()
index = big_npy = None
else:
index = big_npy = None
audio = signal.filtfilt(bh, ah, audio)
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
opt_ts = []
if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i: i - self.window]  # rolling sum: each index holds the sum of the past window samples fang
            for t in range(self.t_center, audio.shape[0], self.t_center):  # one candidate cut per t_center (~60 s) fang
opt_ts.append(
t
- self.t_query
+ np.where(
np.abs(audio_sum[t - self.t_query: t + self.t_query])
== np.abs(audio_sum[t - self.t_query: t + self.t_query]).min()
)[0][0]
                )  # keep the index of the minimum of |audio_sum| within [t - t_query, t + t_query] fang
s = 0
t = None
t1 = ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
p_len = audio_pad.shape[0] // self.window
inp_f0 = None
        if hasattr(f0_file, "name"):
try:
with open(f0_file.name, "r") as f:
lines = f.read().strip("\n").split("\n")
inp_f0 = []
for line in lines:
inp_f0.append([float(i) for i in line.split(",")])
inp_f0 = np.array(inp_f0, dtype="float32")
            except Exception:
traceback.print_exc()
# sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
sid_embed = np.load(sid)
sid = torch.FloatTensor(sid_embed).to(self.device).half()
pitch, pitchf = None, None
#'''
if if_f0 == 1:
pitch, pitchf = self.get_f0(
input_audio_path,
audio_pad,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0,
)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
if self.device == "mps":
pitchf = pitchf.astype(np.float32)
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
#'''
'''
pitch_name = "./pitch_pitchf.npz"
#np.savez(pitch_name, pitch = pitch.detach().cpu().numpy(), pitchf = pitchf.detach().cpu().numpy())
        npz_obj = np.load(pitch_name)  # the file extension is .npz
pitch, pitchf = npz_obj['pitch'], npz_obj['pitchf']
pitch = torch.tensor(pitch, device=self.device).long()
pitchf = torch.tensor(pitchf, device=self.device).float()
#'''
t2 = ttime()
times[1] += t2 - t1
audio_opt = []
audio_pad_list = []
pitch_list = []
pitchf_list = []
        for t in opt_ts:  # run inference segment by segment; each segment is about 60 s here fang
t = t // self.window * self.window
audio_pad_list.append(audio_pad[s: t + self.t_pad2 + self.window])
pitch_list.append(pitch[:, s // self.window: (t + self.t_pad2) // self.window])
pitchf_list.append(pitchf[:, s // self.window: (t + self.t_pad2) // self.window])
s = t
audio_pad_list.append(audio_pad[t:])
pitch_list.append(pitch[:, t // self.window:] if t is not None else pitch)
pitchf_list.append(pitchf[:, t // self.window:] if t is not None else pitchf)
audio_opt = self.ThreadPool_process_core(self.infer_core_fang, audio_pad_list, pitch_list, pitchf_list,
model,
net_g,
sid,
times,
index,
big_npy,
index_rate,
version,
protect
)
'''
        if if_f0 == 1:  # the code below handles the final segment fang
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
pitch[:, t // self.window:] if t is not None else pitch,
pitchf[:, t // self.window:] if t is not None else pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt: -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt: -self.t_pad_tgt]
)
#'''
audio_opt = np.concatenate(audio_opt)
if rms_mix_rate != 1:
audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
if resample_sr >= 16000 and tgt_sr != resample_sr:
audio_opt = librosa.resample(
audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
)
audio_max = np.abs(audio_opt).max() / 0.99
max_int16 = 32768
if audio_max > 1:
max_int16 /= audio_max
audio_opt = (audio_opt * max_int16).astype(np.int16)
del pitch, pitchf, sid
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio_opt
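    # pipeline_mulprocess() mirrors pipeline() but precomputes the per-segment
    # slices and fans them out over the two-worker thread pool via
    # ThreadPool_process_core(). Note it slices pitch/pitchf unconditionally, so
    # it assumes if_f0 == 1. A hedged sketch of swapping it in (same illustrative
    # arguments as the pipeline() example above):
    #   out = vc.pipeline_mulprocess(hubert, net_g, "speaker_embed.npy", wav16k,
    #                                "input.wav", times=[0, 0, 0], f0_up_key=0,
    #                                f0_method="rmvpe", file_index="added.index",
    #                                index_rate=0.5, if_f0=1, filter_radius=3,
    #                                tgt_sr=32000, resample_sr=0, rms_mix_rate=1,
    #                                version="v2", protect=0.33)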