diff --git a/AIMeiSheng/._readme_meisheng.md b/AIMeiSheng/._readme_meisheng.md
deleted file mode 100644
index 50212ca..0000000
Binary files a/AIMeiSheng/._readme_meisheng.md and /dev/null differ
diff --git a/AIMeiSheng/docker_demo/common.py b/AIMeiSheng/docker_demo/common.py
index 098cba8..64aba31 100644
--- a/AIMeiSheng/docker_demo/common.py
+++ b/AIMeiSheng/docker_demo/common.py
@@ -1,121 +1,122 @@
import os
import sys
import time
# import logging
import urllib, urllib.request

# test / production environment
gs_prod = True
# if len(sys.argv) > 1 and sys.argv[1] == "prod":
#     gs_prod = True
# print(gs_prod)

gs_tmp_dir = "/tmp/ai_meisheng_tmp"
gs_model_dir = "/tmp/ai_meisheng_models"
gs_resource_cache_dir = "/tmp/ai_meisheng_resource_cache"

gs_embed_model_path = os.path.join(gs_model_dir, "RawNet3/models/weights/model.pt")
gs_svc_model_path = os.path.join(gs_model_dir, "weights/xusong_v2_org_version_alldata_embed_spkenx200x_double_e14_s90706.pth")
gs_hubert_model_path = os.path.join(gs_model_dir, "hubert.pt")
gs_rmvpe_model_path = os.path.join(gs_model_dir, "rmvpe.pt")
gs_embed_model_spk_path = os.path.join(gs_model_dir, "SpeakerEncoder/pretrained_model/best_model.pth.tar")
gs_embed_config_spk_path = os.path.join(gs_model_dir, "SpeakerEncoder/pretrained_model/config.json")

# errcode
gs_err_code_success = 0
gs_err_code_download_vocal = 100
gs_err_code_download_svc_url = 101
gs_err_code_svc_process = 102
gs_err_code_transcode = 103
gs_err_code_volume_adjust = 104
gs_err_code_upload = 105
gs_err_code_params = 106
gs_err_code_pending = 107
gs_err_code_target_silence = 108
gs_err_code_too_many_connections = 429
+gs_err_code_gender_classify = 430

gs_redis_conf = {
    "host": "av-credis.starmaker.co",
    "port": 6379,
    "pwd": "lKoWEhz%jxTO",
}

gs_server_redis_conf = {
    "producer": "test_ai_meisheng_producer",  # input queue
    "ai_meisheng_key_prefix": "test_ai_meisheng_key_",  # stores per-task result status
}
if gs_prod:
    gs_server_redis_conf = {
        "producer": "ai_meisheng_producer",  # input queue
        "ai_meisheng_key_prefix": "ai_meisheng_key_",  # stores per-task result status
    }

gs_feishu_conf = {
    "url": "http://sg-prod-songbook-webmp-1:8000/api/feishu/people",
    "users": [
        "18810833785",  # 杨建利
        "17778007843",  # 王健军
        "18612496315"  # 郭子豪
    ]
}


def download2disk(url, dst_path):
    try:
        urllib.request.urlretrieve(url, dst_path)
        return os.path.exists(dst_path)
    except Exception as ex:
        print(f"download url={url} error", ex)
    return False


def exec_cmd(cmd):
    # gs_logger.info(cmd)
    print(cmd)
    ret = os.system(cmd)
    if ret != 0:
        return False
    return True


def exec_cmd_and_result(cmd):
    r = os.popen(cmd)
    text = r.read()
    r.close()
    return text


def upload_file2cos(key, file_path, region='ap-singapore', bucket_name='av-audit-sync-sg-1256122840'):
    """
    Upload a file to COS
    :param key: object key in the bucket
    :param file_path: local file path
    :param region: region
    :param bucket_name: bucket name
    :return:
    """
    gs_coscmd = "coscmd"
    gs_coscmd_conf = "~/.cos.conf"

    cmd = "{} -c {} -r {} -b {} upload {} {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, file_path, key)
    if exec_cmd(cmd):
        cmd = "{} -c {} -r {} -b {} info {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, key) \
              + "| grep Content-Length |awk \'{print $2}\'"
        res_str = exec_cmd_and_result(cmd)
        # logging.info("{},res={}".format(key, res_str))
        size = float(res_str)
        if size > 0:
            return True
        return False
    return False


def check_input(input_data):
    key_list = ["record_song_url", "target_url", "start", "end", "vocal_loudness", "female_recording_url", "male_recording_url"]
    for key in key_list:
        if key not in input_data.keys():
            return False
    return True
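Note on the common.py hunk: `gs_err_code_gender_classify = 430` extends the error-code table for the new gender-classification failure path. As a hedged usage sketch of how `check_input` and these codes combine on the caller side (the `handle_request` wrapper is hypothetical, not part of this diff):

```python
# Hypothetical caller-side sketch, not part of this diff: validate the payload
# with check_input and map a malformed request to gs_err_code_params.
from AIMeiSheng.docker_demo.common import (
    check_input, gs_err_code_params, gs_err_code_success,
)

def handle_request(input_data: dict) -> int:
    if not check_input(input_data):  # a required key is missing
        return gs_err_code_params
    # ... download, svc processing, transcode, upload would run here ...
    return gs_err_code_success
```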
diff --git a/AIMeiSheng/docker_demo/svc_online.py b/AIMeiSheng/docker_demo/svc_online.py
index a52ab24..f12143f 100644
--- a/AIMeiSheng/docker_demo/svc_online.py
+++ b/AIMeiSheng/docker_demo/svc_online.py
@@ -1,190 +1,194 @@
# -*- coding: UTF-8 -*-
"""
Core SVC processing logic
"""
import os
import time
import socket
import shutil
import hashlib
from AIMeiSheng.meisheng_svc_final import load_model, process_svc_online
from AIMeiSheng.cos_similar_ui_zoom import cos_similar
from AIMeiSheng.meisheng_env_preparex import meisheng_env_prepare
from AIMeiSheng.voice_classification.online.voice_class_online_fang import VoiceClass, download_volume_balanced
from AIMeiSheng.docker_demo.common import *
import logging

hostname = socket.gethostname()
log_file_name = f"{os.path.dirname(os.path.abspath(__file__))}/av_meisheng_{hostname}.log"

# set up the logger
svc_offline_logger = logging.getLogger("svc_offline")
file_handler = logging.FileHandler(log_file_name)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S')
file_handler.setFormatter(formatter)

if gs_prod:
    svc_offline_logger.addHandler(file_handler)

if os.path.exists(gs_tmp_dir):
    shutil.rmtree(gs_tmp_dir)
os.makedirs(gs_model_dir, exist_ok=True)
os.makedirs(gs_resource_cache_dir, exist_ok=True)

# preset parameters
gs_gender_models_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/hub/voice_classification/models.zip"
gs_volume_bin_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/ebur128_tool"


class GSWorkerAttr:
    def __init__(self, input_data):
        # extract the input resources
        vocal_url = input_data["record_song_url"]
        target_url = input_data["target_url"]
        start = input_data["start"]  # in ms
        end = input_data["end"]  # in ms
        vocal_loudness = input_data["vocal_loudness"]
        female_recording_url = input_data["female_recording_url"]
        male_recording_url = input_data["male_recording_url"]

        self.distinct_id = hashlib.md5(vocal_url.encode()).hexdigest()
        self.tmp_dir = os.path.join(gs_tmp_dir, self.distinct_id)
        if os.path.exists(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.makedirs(self.tmp_dir)

        self.vocal_url = vocal_url
        self.target_url = target_url
        ext = vocal_url.split(".")[-1]
        self.vocal_path = os.path.join(self.tmp_dir, self.distinct_id + f"_in.{ext}")
        self.target_wav_path = os.path.join(self.tmp_dir, self.distinct_id + "_out.wav")
        self.target_wav_ad_path = os.path.join(self.tmp_dir, self.distinct_id + "_out_ad.wav")
        self.target_path = os.path.join(self.tmp_dir, self.distinct_id + "_out.m4a")

        self.female_svc_source_url = female_recording_url
        self.male_svc_source_url = male_recording_url

        ext = female_recording_url.split(".")[-1]
        self.female_svc_source_path = os.path.join(gs_resource_cache_dir,
                                                   hashlib.md5(female_recording_url.encode()).hexdigest() + "." + ext)
        ext = male_recording_url.split(".")[-1]
        self.male_svc_source_path = os.path.join(gs_resource_cache_dir,
                                                 hashlib.md5(male_recording_url.encode()).hexdigest() + "." + ext)

        self.st_tm = start
        self.ed_tm = end
        self.target_loudness = vocal_loudness

    def log_info_name(self):
        return f"d_id={self.distinct_id}, vocal_url={self.vocal_url}"

    def rm_cache(self):
        if os.path.exists(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)

def init_gender_model():
    """
    Download the gender model
    :return:
    """
    dst_model_dir = os.path.join(gs_model_dir, "voice_classification")
    if not os.path.exists(dst_model_dir):
        dst_zip_path = os.path.join(gs_model_dir, "models.zip")
        if not download2disk(gs_gender_models_url, dst_zip_path):
            svc_offline_logger.fatal(f"download gender_model err={gs_gender_models_url}")
        cmd = f"cd {gs_model_dir}; unzip {dst_zip_path}; mv models voice_classification; rm -f {dst_zip_path}"
        os.system(cmd)
        if not os.path.exists(dst_model_dir):
            svc_offline_logger.fatal(f"unzip {dst_zip_path} err")

    music_voice_pure_model = os.path.join(dst_model_dir, "voice_005_rec_v5.pth")
    music_voice_no_pure_model = os.path.join(dst_model_dir, "voice_10_v5.pth")
    gender_pure_model = os.path.join(dst_model_dir, "gender_8k_ratev5_v6_adam.pth")
    gender_no_pure_model = os.path.join(dst_model_dir, "gender_8k_v6_adam.pth")
    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
    return vc


def init_svc_model():
    meisheng_env_prepare(logging, gs_model_dir)
    embed_model, hubert_model = load_model()
    cs_sim = cos_similar()
    return embed_model, hubert_model, cs_sim


def download_volume_adjustment():
    """
    Download the volume adjustment tool
    :return:
    """
    volume_bin_path = os.path.join(gs_model_dir, "ebur128_tool")
    if not os.path.exists(volume_bin_path):
        if not download2disk(gs_volume_bin_url, volume_bin_path):
            svc_offline_logger.fatal(f"download volume_bin err={gs_volume_bin_url}")
    os.system(f"chmod +x {volume_bin_path}")


def volume_adjustment(wav_path, target_loudness, out_path):
    """
    Volume adjustment
    :param wav_path:
    :param target_loudness:
    :param out_path:
    :return:
    """
    volume_bin_path = os.path.join(gs_model_dir, "ebur128_tool")
    cmd = f"{volume_bin_path} {wav_path} {target_loudness} {out_path}"
    os.system(cmd)


class SVCOnline:
    def __init__(self):
        st = time.time()
        self.gender_model = init_gender_model()
        self.embed_model, self.hubert_model, self.cs_sim = init_svc_model()
        download_volume_adjustment()
        download_volume_balanced()
        svc_offline_logger.info(f"svc init finished, sp = {time.time() - st}")

    def gender_process(self, worker_attr):
        st = time.time()
        gender, female_rate, is_pure = self.gender_model.process(worker_attr.vocal_path)
        svc_offline_logger.info(
            f"{worker_attr.vocal_url}, gender={gender}, female_rate={female_rate}, is_pure={is_pure}, "
            f"gender_process sp = {time.time() - st}")
        if gender == 0:
            gender = 'female'
        elif gender == 1:
            gender = 'male'
+        elif female_rate == None:
+            gender = 'male'
+            return gender, gs_err_code_gender_classify
        elif female_rate > 0.5:
            gender = 'female'
        else:
            gender = 'male'
+        svc_offline_logger.info(f"{worker_attr.vocal_url}, modified gender={gender}")
        # err = gs_err_code_success
        # if female_rate == -1:
        #     err = gs_err_code_target_silence
        return gender, gs_err_code_success

    def process(self, worker_attr):
        gender, err = self.gender_process(worker_attr)
        if err != gs_err_code_success:
            return gender, err
        song_path = worker_attr.female_svc_source_path
        if gender == "male":
            song_path = worker_attr.male_svc_source_path

        params = {'gender': gender, 'tst': worker_attr.st_tm, "tnd": worker_attr.ed_tm, 'delay': 0, 'song_path': None}
        st = time.time()
        err_code = process_svc_online(song_path, worker_attr.vocal_path, worker_attr.target_wav_path,
                                      self.embed_model, self.hubert_model, self.cs_sim, params)
        svc_offline_logger.info(f"{worker_attr.vocal_url}, err_code={err_code} process svc sp = {time.time() - st}")
        return gender, err_code
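The `female_rate == None` branch above is the behavioral core of this change: when the classifier returns neither a clear 0/1 label nor a usable `female_rate`, the worker now defaults to 'male' and reports the new `gs_err_code_gender_classify` instead of proceeding. A minimal, self-contained distillation of the resulting decision logic (`resolve_gender` is a hypothetical name; the constants mirror docker_demo/common.py):

```python
# Sketch of SVCOnline.gender_process's branching after this diff.
gs_err_code_success = 0
gs_err_code_gender_classify = 430

def resolve_gender(gender, female_rate):
    if gender == 0:
        return "female", gs_err_code_success
    if gender == 1:
        return "male", gs_err_code_success
    if female_rate is None:  # classifier produced no usable rate
        return "male", gs_err_code_gender_classify
    if female_rate > 0.5:
        return "female", gs_err_code_success
    return "male", gs_err_code_success

assert resolve_gender(2, None) == ("male", 430)
assert resolve_gender(2, 0.8) == ("female", 0)
```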
svc_offline_logger.info(f"{worker_attr.vocal_url}, err_code={err_code} process svc sp = {time.time() - st}") return gender, err_code diff --git a/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py b/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py index 076184f..f1e8f48 100644 --- a/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py +++ b/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py @@ -1,778 +1,781 @@ import numpy as np, parselmouth, torch, pdb, sys, os from time import time as ttime import torch.nn.functional as F import scipy.signal as signal import pyworld, os, traceback, faiss, librosa, torchcrepe from scipy import signal from functools import lru_cache now_dir = os.getcwd() sys.path.append(now_dir) bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) input_audio_path2wav = {} fidx = 0 import threading import concurrent.futures @lru_cache def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): audio = input_audio_path2wav[input_audio_path] f0, t = pyworld.harvest( audio, fs=fs, f0_ceil=f0max, f0_floor=f0min, frame_period=frame_period, ) f0 = pyworld.stonemask(audio, f0, t, fs) return f0 def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 # print(data1.max(),data2.max()) rms1 = librosa.feature.rms( y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 ) # 每半秒一个点 rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) rms1 = torch.from_numpy(rms1) rms1 = F.interpolate( rms1.unsqueeze(0), size=data2.shape[0], mode="linear" ).squeeze() rms2 = torch.from_numpy(rms2) rms2 = F.interpolate( rms2.unsqueeze(0), size=data2.shape[0], mode="linear" ).squeeze() rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) data2 *= ( torch.pow(rms1, torch.tensor(1 - rate)) * torch.pow(rms2, torch.tensor(rate - 1)) ).numpy() return data2 class VC(object): def __init__(self, tgt_sr, config): self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( config.x_pad, ##config会根据设备配置不通知如:3 config.x_query, # 10 等于x_max-x_center)*2 config.x_center, #60 config.x_max, #65 config.is_half, ) self.sr = 16000 # hubert输入采样率 self.window = 160 # 每帧点数 self.t_pad = self.sr * self.x_pad # 每条前后pad时间 self.t_pad_tgt = tgt_sr * self.x_pad self.t_pad2 = self.t_pad * 2 self.t_query = self.sr * self.x_query # 查询切点前后查询时间, self.t_center = self.sr * self.x_center # 查询切点位置 self.t_max = self.sr * self.x_max # 免查询时长阈值 self.device = config.device def get_f0( self, input_audio_path, x, p_len, f0_up_key, f0_method, filter_radius, inp_f0=None, ): global input_audio_path2wav time_step = self.window / self.sr * 1000 f0_min = 50 f0_max = 1100 f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700) if f0_method == "pm": f0 = ( parselmouth.Sound(x, self.sr) .to_pitch_ac( time_step=time_step / 1000, voicing_threshold=0.6, pitch_floor=f0_min, pitch_ceiling=f0_max, ) .selected_array["frequency"] ) pad_size = (p_len - len(f0) + 1) // 2 if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad( f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" ) elif f0_method == "harvest": input_audio_path2wav[input_audio_path] = x.astype(np.double) f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) if filter_radius > 2: f0 = signal.medfilt(f0, 3) elif f0_method == "crepe": model = "full" # Pick a batch size that doesn't cause memory errors on your gpu batch_size = 512 # Compute pitch using first gpu audio = torch.tensor(np.copy(x))[None].float() f0, pd = torchcrepe.predict( audio, self.sr, self.window, f0_min, f0_max, model, 
                batch_size=batch_size,
                device=self.device,
                return_periodicity=True,
            )
            pd = torchcrepe.filter.median(pd, 3)
            f0 = torchcrepe.filter.mean(f0, 3)
            f0[pd < 0.1] = 0
            f0 = f0[0].cpu().numpy()
        elif f0_method == "rmvpe":
            if hasattr(self, "model_rmvpe") == False:
                from lib.rmvpe import RMVPE

                print("loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "rmvpe.pt", is_half=self.is_half, device=self.device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        else:  # for meisheng
            self.model_rmvpe = f0_method
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)

        # pitch modification starts here fang
        valid_f0 = f0[f0 > 50]
        mean_pitch_cur = np.mean(valid_f0[:min(len(valid_f0), 500)])
-        #print("@@f0_up_key:",f0_up_key)
        deta = 0
        if(f0_up_key > 50 ):
            deta = -mean_pitch_cur + f0_up_key
            #print("$$$$$$$$$fangxxxxx pitch shift: ",deta)
-        f0_up_key = int(np.log2(deta/(mean_pitch_cur + 1) + 1) * 12)  ## method 2 fang
-        if( abs(f0_up_key) <= 8 ):
+        f0_up_key = np.log2(deta/(mean_pitch_cur + 1) + 1) * 12
+        if np.isnan(f0_up_key):
            f0_up_key = 0
-        elif f0_up_key > 8:
+        f0_up_key = int(f0_up_key)
+        #f0_up_key = int(np.log2(deta/(mean_pitch_cur + 1) + 1) * 12)  ## method 2 fang
+        if( f0_up_key >= 12 ):
            f0_up_key = 12
-        elif f0_up_key < -8:
+        elif f0_up_key < -12:
            f0_up_key = -12
+        else:
+            f0_up_key = 0
        #if( abs(f0_up_key) < 3 ):
        #    f0_up_key = 0
-        f0_up_key = max(min(12,f0_up_key),-12)
+        # f0_up_key = max(min(12,f0_up_key),-12)
        #print("f0_up_key: ",f0_up_key)

        f0 *= pow(2, f0_up_key / 12)  # this applies the pitch change fang; I set it to 0
        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                :shape
            ]
        # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(int)

        return f0_coarse, f0bak  # 1-0

    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if self.is_half:
            feats = feats.half()
        else:
            feats = feats.float()
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
        #print("@@@feats: ",feats.shape)
        #print("@@@padding_mask: ",padding_mask.shape)
        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
            #"output_layer": 6 if version == "v1" else 12,
        }
        t0 = ttime()
        #'''
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]  # why does v1 need this projection, a dimension issue??? fang
        #'''
        #print("@@@feats: ",feats.shape)
        '''
        global fidx
        feats_name = f"./feats_{fidx}.pt"
        fidx += 1
        torch.save(feats, feats_name)
        feats = torch.load(feats_name)
        #'''
        if protect < 0.5 and pitch != None and pitchf != None:
            feats0 = feats.clone()
        if (
            isinstance(index, type(None)) == False
            and isinstance(big_npy, type(None)) == False
            and index_rate != 0
        ):
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            # _, I = index.search(npy, 1)
            # npy = big_npy[I.squeeze()]

            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )  # combine the index features with the actual audio features as the input fang

        #print("@@@feats: ",feats.shape)
        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if protect < 0.5 and pitch != None and pitchf != None:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )  # dim 1 of feats0 is doubled by interpolation fang
        t1 = ttime()
        p_len = audio0.shape[0] // self.window  # frame-wise pitch fang
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if pitch != None and pitchf != None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]

        if protect < 0.5 and pitch != None and pitchf != None:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        #print("###feats:",feats.shape,"pitch:",pitch.shape,"p_len:",p_len)
        with torch.no_grad():
            if pitch != None and pitchf != None:
                audio1 = (
                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
            else:
                audio1 = (
                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                )
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,  # input wav
        input_audio_path,  # input wav name
        times,
        f0_up_key,
        f0_method,  # f0 method
        file_index,  # index path
        # file_big_npy,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        f0_file=None,
    ):
        if (
            file_index != ""  # the .index file is not empty fang
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy) == True
            and os.path.exists(file_index) == True
            and index_rate != 0
        ):
            try:
                index = faiss.read_index(file_index)
                # big_npy = np.load(file_big_npy)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except:
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None
        #print("####audio 1:",audio.shape)
        audio = signal.filtfilt(bh, ah, audio)
        #print("####audio 2:",audio.shape)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        #print("###t_max:",self.t_max)
        #print("###window:",self.window,"self.t_query:",self.t_query,"self.t_pad2:",self.t_pad2)
        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]  # a rolling sum: each idx holds the sum over the previous frame fang
            for t in range(self.t_center, audio.shape[0], self.t_center):  # one frame per minute?? fang
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                    )[0][0]
                )  # store the index of the minimum within [t - self.t_query, t + self.t_query] fang
        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name") == True:
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        #sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        sid_embed = np.load(sid)
        sid = torch.FloatTensor(sid_embed).to(self.device).half()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        #print("&&&&pitch: ",pitchf)
        t2 = ttime()
        times[1] += t2 - t1
        #print("####len(audio_pad):",len(audio_pad))
        #print("###pitch:", pitch.shape)
        for t in opt_ts:  # run inference segment by segment; a segment is about 60 s here fang
            t = t // self.window * self.window
            if if_f0 == 1:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        None,
                        None,
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
        if if_f0 == 1:  # what follows handles the last segment fang
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window :] if t is not None else pitch,
                    pitchf[:, t // self.window :] if t is not None else pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt

    def infer_core_fang(self, para1, para2, para3, idx,
                        model, net_g, sid, times, index, big_npy, index_rate, version, protect):
        return [
            self.vc(
                model,
                net_g,
                sid,
                para1,
                para2,
                para3,
                # audio_pad[s: t + self.t_pad2 + self.window],
                # pitch[:, s // self.window: (t + self.t_pad2) // self.window],
                # pitchf[:, s // self.window: (t + self.t_pad2) // self.window],
                times,
                index,
                big_npy,
                index_rate,
                version,
                protect,
            )[self.t_pad_tgt: -self.t_pad_tgt], idx]

    def ThreadPool_process_core(self, func_process, params1, params2, params3,
                                model,
                                net_g,
                                sid,
                                # audio_pad[s: t + self.t_pad2 + self.window],
                                # pitch[:, s // self.window: (t + self.t_pad2) // self.window],
                                # pitchf[:, s // self.window: (t + self.t_pad2) // self.window],
                                times,
                                index,
                                big_npy,
                                index_rate,
                                version,
                                protect
                                ):
        num_threads = 2
        futures = []
        sort_ret = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            for idx in range(len(params1)):
                para1 = params1[idx]
                para2 = params2[idx]
                para3 = params3[idx]
                ret = executor.submit(self.infer_core_fang, para1, para2, para3, idx,
                                      model, net_g, sid, times, index, big_npy,
                                      index_rate, version, protect)
                futures.append(ret)

            cnt = 0
            for future in concurrent.futures.as_completed(futures):
                cnt += 1
                #print(f"process finished {cnt}, and index :{future.result()[1]}")
                #print(future.result())  # result
                # print(future.result()[1])  # index
                sort_ret[str(future.result()[1])] = future.result()[0]

        fea_list = []
        for idx in range(len(sort_ret)):
            fea_list.append(sort_ret[str(idx)])

        return fea_list

    def pipeline_mulprocess(
        self,
        model,
        net_g,
        sid,
        audio,  # input wav
        input_audio_path,  # input wav name
        times,
        f0_up_key,
        f0_method,  # f0 method
        file_index,  # index path
        # file_big_npy,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        f0_file=None,
    ):
        if (
            file_index != ""  # the .index file is not empty fang
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy) == True
            and os.path.exists(file_index) == True
            and index_rate != 0
        ):
            try:
                index = faiss.read_index(file_index)
                # big_npy = np.load(file_big_npy)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except:
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None

        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i: i - self.window]  # a rolling sum: each idx holds the sum over the previous frame fang
            for t in range(self.t_center, audio.shape[0], self.t_center):  # one frame per minute?? fang
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query: t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query: t + self.t_query]).min()
                    )[0][0]
                )  # store the index of the minimum within [t - self.t_query, t + self.t_query] fang

        s = 0
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name") == True:
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        # sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        sid_embed = np.load(sid)
        embed_npy_spk = sid[:-4] + '_spk.npy'
        sid_spk_embed = np.load(embed_npy_spk)
        print("555555sid_embed:", np.shape(sid_embed), 'type:', type(sid_embed))
        print('sid_spk_embed:', np.shape(sid_spk_embed), 'type:', type(sid_spk_embed))
        sid_embed = np.concatenate((sid_embed, sid_spk_embed), axis=0)
        print('sid_embed:', np.shape(sid_embed), 'type:', type(sid_embed))
        sid = torch.FloatTensor(sid_embed).to(self.device).half()
        #sid_embed = np.load(sid)
        #sid = torch.FloatTensor(sid_embed).to(self.device).half()
        print('sid:', sid.shape)

        pitch, pitchf = None, None
        #'''
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        #'''
        '''
        pitch_name = "./pitch_pitchf.npz"
        #np.savez(pitch_name, pitch = pitch.detach().cpu().numpy(), pitchf = pitchf.detach().cpu().numpy())
        npz_obj = np.load(pitch_name)  # file extension is npz
        pitch, pitchf = npz_obj['pitch'], npz_obj['pitchf']
        pitch = torch.tensor(pitch, device=self.device).long()
        pitchf = torch.tensor(pitchf, device=self.device).float()
        #'''
        t2 = ttime()
        times[1] += t2 - t1

        audio_opt = []
        audio_pad_list = []
        pitch_list = []
        pitchf_list = []
        for t in opt_ts:  # run inference segment by segment; a segment is about 60 s here fang
            t = t // self.window * self.window
            audio_pad_list.append(audio_pad[s: t + self.t_pad2 + self.window])
            pitch_list.append(pitch[:, s // self.window: (t + self.t_pad2) // self.window])
            pitchf_list.append(pitchf[:, s // self.window: (t + self.t_pad2) // self.window])
            s = t

        audio_pad_list.append(audio_pad[t:])
        pitch_list.append(pitch[:, t // self.window:] if t is not None else pitch)
        pitchf_list.append(pitchf[:, t // self.window:] if t is not None else pitchf)

        audio_opt = self.ThreadPool_process_core(self.infer_core_fang, audio_pad_list, pitch_list, pitchf_list,
                                                 model,
                                                 net_g,
                                                 sid,
                                                 times,
                                                 index,
                                                 big_npy,
                                                 index_rate,
                                                 version,
                                                 protect
                                                 )
        '''
        if if_f0 == 1:  # what follows handles the last segment fang
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window:] if t is not None else pitch,
                    pitchf[:, t // self.window:] if t is not None else pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt: -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt: -self.t_pad_tgt]
            )
        #'''
        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
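The other substantive change in this diff is the pitch-shift policy in `get_f0`: the semitone offset is now computed as a float first, guarded against NaN (silent input yields an undefined mean pitch), and then either clamped to a full octave (+/-12 semitones) or dropped to 0; the old +/-8 thresholds and the trailing `max(min(...))` clamp are gone. A hedged sketch of just that arithmetic (`semitone_shift` is an illustrative name; the values below are examples):

```python
import numpy as np

def semitone_shift(mean_pitch_cur, f0_up_key):
    # Mirrors the post-diff logic in VC.get_f0: f0_up_key above 50 is treated
    # as a target pitch in Hz; NaN guards to 0; only offsets reaching a full
    # octave survive (clamped to +/-12), and smaller offsets are zeroed out.
    deta = 0
    if f0_up_key > 50:
        deta = -mean_pitch_cur + f0_up_key
    shift = np.log2(deta / (mean_pitch_cur + 1) + 1) * 12
    if np.isnan(shift):
        return 0
    shift = int(shift)
    if shift >= 12:
        return 12
    elif shift < -12:
        return -12
    return 0

# e.g. mean pitch 220 Hz, target 230 Hz: well under an octave, so no shift
assert semitone_shift(220.0, 230.0) == 0
# a target two octaves up is clamped to a single octave
assert semitone_shift(110.0, 440.0) == 12
```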