diff --git a/AIMeiSheng/SpeakerEncoder/README.md b/AIMeiSheng/SpeakerEncoder/README.md
new file mode 100644
index 0000000..f1e6bce
--- /dev/null
+++ b/AIMeiSheng/SpeakerEncoder/README.md
@@ -0,0 +1,17 @@
+# Speaker Encoder
+
+This repository contains the Speaker Encoder model from the Mozilla TTS repository, without the additional modules, for easy-to-use computation of speech embeddings.
+
+### Steps
+- Clone the repository
+- Download a pretrained speaker encoder model from here: https://github.com/mozilla/TTS/wiki/Released-Models
+**Preferred**: Speaker-Encoder by @mueller91
+
+- Copy the files `config.json` and `best_model.pth.tar` to the folder `pretrained_model`
+
+- Run `python compute_embedding.py --input_type "single_file" --input_path "WAV_PATH"`, specifying the path to a wav file
+
+It prints the embedding vector (note that the `--output_name` flag and the pickle export are currently commented out in the script)
+
+**Note**: To compute embedding vectors for all wav files inside a folder, see the bash script `./scripts/batch_folder.sh`
\ No newline at end of file
diff --git a/AIMeiSheng/SpeakerEncoder/cal_cos_distance_folder.py b/AIMeiSheng/SpeakerEncoder/cal_cos_distance_folder.py
new file mode 100644
index 0000000..40f70ef
--- /dev/null
+++ b/AIMeiSheng/SpeakerEncoder/cal_cos_distance_folder.py
@@ -0,0 +1,47 @@
+import os
+import sys
+
+import torch
+import numpy as np
+from scipy.spatial.distance import cosine
+
+
+def l2_norm(s1, s2):
+    norm = torch.sum(s1 * s2, -1, keepdim=True)
+    return norm
+
+
+def cos_distance(s1, s2, eps=1e-8):
+    '''# Method 1: cosine similarity built from dot products
+    s1_s2_norm = l2_norm(s1, s2)
+    s2_s2_norm = l2_norm(s2, s2)
+    s1_s1_norm = l2_norm(s1, s1)
+    #print('s1_s1_norm: ', s1_s1_norm)
+    #print('s1_s2_norm: ', s1_s2_norm)
+    #print('s2_s2_norm: ', s2_s2_norm)
+    loss = s1_s2_norm / (torch.sqrt(s2_s2_norm * s1_s1_norm) + eps)
+    #'''
+    # Despite the name, this returns the cosine *similarity* (1 - cosine distance).
+    loss = (1 - cosine(s1, s2))
+    return loss
+
+
+def load_and_cal_distance(npy_name1, npy_name2):
+    spk1_embed = np.array(np.load(npy_name1))#.mean(0)
+    spk2_embed = np.array(np.load(npy_name2))#.mean(0)
+    print('dim1:', spk1_embed.shape)
+    print('dim2:', spk2_embed.shape)
+    spk1_embed = torch.from_numpy(spk1_embed)
+    spk2_embed = torch.from_numpy(spk2_embed)
+    loss = cos_distance(spk1_embed, spk2_embed)#.mean(0)
+    print("file:", os.path.basename(npy_name2), "cos similarity:", loss)
+    return loss
+
+
+def cal_cos_folder(target_npy, test_folder):
+    all_files = os.listdir(test_folder)
+    for npy_test in all_files:
+        npy_filename = os.path.join(test_folder, npy_test)
+        load_and_cal_distance(target_npy, npy_filename)
+
+
+if __name__ == '__main__':
+    npy_name1 = sys.argv[1]  # target embedding, e.g. "../../test_wav/xiafan_RawNet3/zihao.npy"
+    npy_name2 = sys.argv[2]  # folder of embeddings to compare, e.g. "../../test_wav/xiafan_RawNet3/"
+
+    #load_and_cal_distance(npy_name1, npy_name2)
+    cal_cos_folder(npy_name1, npy_name2)
diff --git a/AIMeiSheng/SpeakerEncoder/compute_embedding.py b/AIMeiSheng/SpeakerEncoder/compute_embedding.py
new file mode 100644
index 0000000..e081c11
--- /dev/null
+++ b/AIMeiSheng/SpeakerEncoder/compute_embedding.py
@@ -0,0 +1,130 @@
+from limit_threads import *
+import argparse
+import glob
+import os
+import numpy as np
+import pickle
+import random
+from tqdm import tqdm
+import torch
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from functools import partial
+from speaker_encoder.model import SpeakerEncoder
+from speaker_encoder.audio import AudioProcessor
+from speaker_encoder.io import load_config
+
+
+class SpeechEmbedding():
+    def __init__(self, config, model_path):
+        self.ap = AudioProcessor(**config['audio'])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Define Encoder model and load pretrained checkpoint
+        self.model = SpeakerEncoder(**config.model).to(self.device)
+        self.model.load_state_dict(torch.load(model_path, map_location=self.device)['model'])
+        self.model.eval()
+
+    def compute_embedding(self, wav_file, itr, total, verbose=True):
+        if verbose:
+            print(f"Computing embedding for file {itr}/{total}")
+
+        mel_spec = self.ap.melspectrogram(self.ap.load_wav(wav_file, sr=self.ap.sample_rate)).T
+        mel_spec = 
torch.FloatTensor(mel_spec[None, :, :]) + mel_spec = mel_spec.to(self.device) + embedd = self.model.compute_embedding(mel_spec) + embedd = embedd.detach().cpu().numpy() + + return embedd + + +def main(args): + config = load_config(args.config_path) + speech_embedding = SpeechEmbedding(config, args.model_path) + if (not os.path.exists(args.output_path)): + os.mkdir(args.output_path) + + emb_dict = {} + # Compute speaker embeddings + if args.input_type == "single_file": + wav_file = args.input_path + embedd = speech_embedding.compute_embedding(wav_file, 1, 1) + embedd = embedd[0] + emb_dict[args.speaker_name] = embedd + print(embedd) + else: + if args.input_type == "single_speaker": + spk_list = [args.speaker_name] + elif args.input_type == "multi_speaker": + spk_list = os.listdir(args.input_path) + + executor = ProcessPoolExecutor(max_workers=args.num_workers) + for spk_itr, spk_name in enumerate(spk_list): + print(f"========== Speaker {spk_itr}/{len(spk_list)}::") + if args.input_type == "single_speaker": + wav_files = glob.glob(os.path.join(args.input_path, "*.wav")) + elif args.input_type == "multi_speaker": + #wav_files = glob.glob(os.path.join(args.input_path, spk_name, "*.wav")) + wav_files = glob.glob(os.path.join(args.input_path, "*.wav")) + print('wav_files:',wav_files) + # Skip if no wav available for speaker + if len(wav_files) == 0: + continue + + # Randomly shuffle and select a sub-list of num_wavs != -1 + if args.num_wavs != -1: + print(f"Selecting {args.num_wavs} random wavs ...") + random.shuffle(wav_files) + wav_files = wav_files[:args.num_wavs] + + # Compute embeddings for all wav files + all_embdds = [] + for itr, wav_file in enumerate(wav_files): + # embedd = executor.submit(speech_embedding.compute_embedding, + # wav_file, + # itr, + # len(wav_files)) + embedd = speech_embedding.compute_embedding(wav_file, itr, len(wav_files)) + all_embdds.append((os.path.basename(wav_file), embedd[0])) + + # for basename_tmp, embedd in concurrent.futures.as_completed(all_embdds): + for basename_tmp, embedd in all_embdds: + print('embedd:',embedd.shape) + + # basename_tmp = os.path.basename(wav_files) + out_dirname = os.path.join(args.output_path, basename_tmp)[:-4] + '.npy' + print('out_dirname:',out_dirname) + np.save(out_dirname, embedd) + + # print('out_dirname1:', all_embdds[0][0],all_embdds[0][1].result()) + # Process outputs + # all_embdds = [(embedd[0], embedd[1].result()) for embedd in all_embdds if embedd[1].result() is not None] + # print('out_dirname2:',all_embdds) + # # Add embedding of all files + # emb_dict[spk_name] = {} + # if args.mode == "all_embs": + # emb_dict[spk_name].update({embed[0]:embed[1][0] for embed in all_embdds}) + # + # # Add mean of embeddings + # all_embdds_list = [list(embedd[1][0]) for embedd in all_embdds] + # embedd_mean = np.mean(np.array(all_embdds_list), axis=0) + # emb_dict[spk_name].update({"mean":embedd_mean}) + + # with open(os.path.join(args.output_path, f"{args.output_name}_emb.pkl"), "wb") as pkl_file: + # pickle.dump(emb_dict, pkl_file) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', type=str, default="pretrained_model/best_model.pth.tar", required=False) + parser.add_argument('--config_path', type=str, default="pretrained_model/config.json", required=False) + parser.add_argument('--output_path', type=str, default="outputs/", required=False) + parser.add_argument('--input_path', type=str, default="outputs/", required=False) + parser.add_argument('--input_type', type=str, 
default="file", required=False) # single_speaker, #multi_speaker + parser.add_argument('--speaker_name', type=str, default="default", required=False) + parser.add_argument('--num_wavs', type=int, default=20, required=False) + parser.add_argument('--num_workers', type=int, default=10, required=False) + #parser.add_argument('--output_name', type=str, required=True) + parser.add_argument('--mode', type=str, default="all_embs", required=False) + + args = parser.parse_args() + + main(args) diff --git a/AIMeiSheng/SpeakerEncoder/compute_embedding_svc_multi.py b/AIMeiSheng/SpeakerEncoder/compute_embedding_svc_multi.py new file mode 100644 index 0000000..96a3988 --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/compute_embedding_svc_multi.py @@ -0,0 +1,202 @@ +from limit_threads import * +import argparse +import glob +import os +import sys +import numpy as np +import pickle +import random +from tqdm import tqdm +import torch +from concurrent.futures import ProcessPoolExecutor, as_completed +from functools import partial +from speaker_encoder.model import SpeakerEncoder +from speaker_encoder.audio import AudioProcessor +from speaker_encoder.io import load_config + +from multi_threads_wraper import ThreadPool_process_core + +class SpeechEmbedding(): + def __init__(self, config, model_path): + self.ap = AudioProcessor(**config['audio']) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Define Encoder model and load pretrained checkpoint + self.model = SpeakerEncoder(**config.model).to(self.device) + self.model.load_state_dict(torch.load(model_path, map_location=self.device)['model']) + self.model.eval() + + def compute_embedding(self, wav_file, itr, total, verbose=True): + if verbose: + print(f"Computing embedding for file {itr}/{total}") + + mel_spec = self.ap.melspectrogram(self.ap.load_wav(wav_file, sr=self.ap.sample_rate)).T + mel_spec = torch.FloatTensor(mel_spec[None, :, :]) + mel_spec = mel_spec.to(self.device) + embedd = self.model.compute_embedding(mel_spec) + embedd = embedd.detach().cpu().numpy() + + return embedd + + + +from time import sleep, time +from multi_threads_wraper import ThreadPool_process_core +def main_filelist_multi(args) -> None: + + config = load_config(args.config_path) + model = SpeechEmbedding(config, args.model_path) + + + def process_embed_extra(wav_filename, out_dirname): + print("@@@@input wav name :", wav_filename) + print("embedding out_dirname: ", out_dirname) + if not os.path.exists(out_dirname): + embedd = model.compute_embedding(wav_filename, 1, 1) + output = embedd[0] + print("embead shapexx:")#, output)#.size()) + + np.save(out_dirname, output)#.detach().cpu().numpy()) + return + + ###finish文件下载以后,进行数据提取 + if args.filelist is not None: + with open(args.filelist, "r") as file: + # 按行读取文件内容 + lines = file.readlines() + #lines = lines[:10] + # # 遍历每一行并输出 + print("len_lines: ",len(lines)) + cnt = 0 + src_para = [] + des_para = [] + start_time = time() + for line in lines: + cnt += 1 + print(f"cnt: {cnt}/{len(lines)}") + #print(line.strip()) # 使用 strip() 方法移除行尾的换行符 + ###这块提取出来wav文件路径。。。。 + wav_filename = line.split('|')[0] #/data/bingxiao.fang/voice_conversion/Retrieval-based-Voice-Conversion-WebUIx/content/Retrieval-based-Voice-Conversion-WebUI/logs/xusong_v2_org_version_multispk_eaysing4/用户1-男_wav_part/0_gt_wavs/5_17.wav + + basename_tmp = os.path.basename(wav_filename) + dir_parent = os.path.dirname(os.path.dirname(wav_filename)) + out_dir = os.path.join(dir_parent, '5_embed256') + if(not os.path.exists(out_dir)): + 
os.mkdir(out_dir) + print("mkdir out_dir: ",out_dir) + + out_dirname = os.path.join(out_dir, basename_tmp)[:-4] + '.npy' + src_para.append(wav_filename) + des_para.append(out_dirname) + # print('wav_filename:',wav_filename) + # print('out_dirname:', out_dirname) + + for idx in range(len(des_para)): + process_embed_extra(src_para[idx], des_para[idx]) + #ThreadPool_process_core(process_embed_extra, src_para, des_para) + end_time = time() + duration = end_time - start_time + print("process finished cost {:.3f} seconds ,".format(duration)) + + return + + +def main(args): + config = load_config(args.config_path) + speech_embedding = SpeechEmbedding(config, args.model_path) + if (not os.path.exists(args.output_path)): + os.mkdir(args.output_path) + + emb_dict = {} + # Compute speaker embeddings + if args.input_type == "single_file": + wav_file = args.input_path + embedd = speech_embedding.compute_embedding(wav_file, 1, 1) + embedd = embedd[0] + emb_dict[args.speaker_name] = embedd + print(embedd) + else: + if args.input_type == "single_speaker": + spk_list = [args.speaker_name] + elif args.input_type == "multi_speaker": + spk_list = os.listdir(args.input_path) + + # executor = ProcessPoolExecutor(max_workers=args.num_workers) + for spk_itr, spk_name in enumerate(spk_list): + print(f"========== Speaker {spk_itr}/{len(spk_list)}::") + if args.input_type == "single_speaker": + wav_files = glob.glob(os.path.join(args.input_path, "*.wav")) + elif args.input_type == "multi_speaker": + #wav_files = glob.glob(os.path.join(args.input_path, spk_name, "*.wav")) + wav_files = glob.glob(os.path.join(args.input_path, "*.wav")) + print('wav_files:',wav_files) + # Skip if no wav available for speaker + if len(wav_files) == 0: + continue + + # Randomly shuffle and select a sub-list of num_wavs != -1 + if args.num_wavs != -1: + print(f"Selecting {args.num_wavs} random wavs ...") + random.shuffle(wav_files) + wav_files = wav_files[:args.num_wavs] + + # Compute embeddings for all wav files + all_embdds = [] + + if 1: + for wav_filename in wav_files: + basename_tmp = os.path.basename(wav_filename) + # dir_parent = os.path.dirname(os.path.dirname(wav_filename)) + # out_dir = os.path.join(dir_parent, '4_embed256') + if (not os.path.exists(out_dir)): + os.mkdir(out_dir) + # print("mkdir out_dir: ",out_dir) + + out_dirname = os.path.join(out_dir, basename_tmp)[:-4] + '.npy' + src_para.append(wav_filename) + des_para.append(out_dirname) + + ThreadPool_process_core(process_embed_extra, src_para, des_para) + else: + for itr, wav_file in enumerate(wav_files): + # embedd = executor.submit(speech_embedding.compute_embedding, + # wav_file, + # itr, + # len(wav_files)) + embedd = speech_embedding.compute_embedding(wav_file, itr, len(wav_files)) + all_embdds.append((os.path.basename(wav_file), embedd[0])) + + # for basename_tmp, embedd in concurrent.futures.as_completed(all_embdds): + for basename_tmp, embedd in all_embdds: + print('embedd:',embedd.shape) + + # basename_tmp = os.path.basename(wav_files) + out_dirname = os.path.join(args.output_path, basename_tmp)[:-4] + '.npy' + print('out_dirname:',out_dirname) + np.save(out_dirname, embedd) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', type=str, default="pretrained_model/best_model.pth.tar", required=False) + parser.add_argument('--config_path', type=str, default="pretrained_model/config.json", required=False) + parser.add_argument('--output_path', type=str, default="outputs/", required=False) + 
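+    # Note: apart from --filelist, the flags below mirror compute_embedding.py; in
+    # filelist mode, main_filelist_multi() only reads --model_path, --config_path
+    # and --filelist, writing one .npy per wav into a sibling '5_embed256' folder.
+    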
parser.add_argument('--input_path', type=str, default="outputs/", required=False) + parser.add_argument('--input_type', type=str, default="file", required=False) # single_speaker, #multi_speaker + parser.add_argument('--speaker_name', type=str, default="default", required=False) + parser.add_argument('--num_wavs', type=int, default=20, required=False) + parser.add_argument('--num_workers', type=int, default=10, required=False) + #parser.add_argument('--output_name', type=str, required=True) + parser.add_argument('--mode', type=str, default="all_embs", required=False) + parser.add_argument( + "--filelist", + type=str, + default="", + help="Input filelist to extract embedding in rvc.", + ) + + args = parser.parse_args() + + # main(args) + sys.exit(main_filelist_multi(args)) diff --git a/AIMeiSheng/SpeakerEncoder/compute_embedding_svc_multi_test.py b/AIMeiSheng/SpeakerEncoder/compute_embedding_svc_multi_test.py new file mode 100644 index 0000000..e3e0fc4 --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/compute_embedding_svc_multi_test.py @@ -0,0 +1,248 @@ +import os +import sys +sys.path.append(os.path.dirname(__file__)) +from limit_threads import * +import argparse +import glob +#import os +#import sys +import numpy as np +import pickle +import random +from tqdm import tqdm +import torch +from concurrent.futures import ProcessPoolExecutor, as_completed +from functools import partial +from speaker_encoder.model import SpeakerEncoder +from speaker_encoder.audio import AudioProcessor +from speaker_encoder.io import load_config + +from multi_threads_wraper import ThreadPool_process_core + +class SpeechEmbedding(): + def __init__(self, config, model_path): + self.ap = AudioProcessor(**config['audio']) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Define Encoder model and load pretrained checkpoint + self.model = SpeakerEncoder(**config.model).to(self.device) + self.model.load_state_dict(torch.load(model_path, map_location=self.device)['model']) + self.model.eval() + + def compute_embedding(self, wav_file, itr, total, verbose=True): + if verbose: + print(f"Computing embedding for file {itr}/{total}") + + mel_spec = self.ap.melspectrogram(self.ap.load_wav(wav_file, sr=self.ap.sample_rate)).T + mel_spec = torch.FloatTensor(mel_spec[None, :, :]) + mel_spec = mel_spec.to(self.device) + embedd = self.model.compute_embedding(mel_spec) + embedd = embedd.detach().cpu().numpy() + + return embedd + + + +from time import sleep, time +from multi_threads_wraper import ThreadPool_process_core +def main_filelist_multi(args) -> None: + + config = load_config(args.config_path) + model = SpeechEmbedding(config, args.model_path) + + + def process_embed_extra(wav_filename, out_dirname): + print("@@@@input wav name :", wav_filename) + print("embedding out_dirname: ", out_dirname) + if not os.path.exists(out_dirname): + embedd = model.compute_embedding(wav_filename, 1, 1) + output = embedd[0] + print("embead shapexx:")#, output)#.size()) + + np.save(out_dirname, output)#.detach().cpu().numpy()) + return + + ###finish文件下载以后,进行数据提取 + if args.filelist is not None: + with open(args.filelist, "r") as file: + # 按行读取文件内容 + lines = file.readlines() + #lines = lines[:10] + # # 遍历每一行并输出 + print("len_lines: ",len(lines)) + cnt = 0 + src_para = [] + des_para = [] + start_time = time() + for line in lines: + cnt += 1 + print(f"cnt: {cnt}/{len(lines)}") + #print(line.strip()) # 使用 strip() 方法移除行尾的换行符 + ###这块提取出来wav文件路径。。。。 + wav_filename = line.split('|')[0] 
#/data/bingxiao.fang/voice_conversion/Retrieval-based-Voice-Conversion-WebUIx/content/Retrieval-based-Voice-Conversion-WebUI/logs/xusong_v2_org_version_multispk_eaysing4/用户1-男_wav_part/0_gt_wavs/5_17.wav + + basename_tmp = os.path.basename(wav_filename) + dir_parent = os.path.dirname(os.path.dirname(wav_filename)) + out_dir = os.path.join(dir_parent, '5_embed256') + if(not os.path.exists(out_dir)): + os.mkdir(out_dir) + print("mkdir out_dir: ",out_dir) + + out_dirname = os.path.join(out_dir, basename_tmp)[:-4] + '.npy' + src_para.append(wav_filename) + des_para.append(out_dirname) + # print('wav_filename:',wav_filename) + # print('out_dirname:', out_dirname) + + for idx in range(len(des_para)): + process_embed_extra(src_para[idx], des_para[idx]) + #ThreadPool_process_core(process_embed_extra, src_para, des_para) + end_time = time() + duration = end_time - start_time + print("process finished cost {:.3f} seconds ,".format(duration)) + + return + +def get_embed_model(gs_embed_model_spk_path,gs_embed_config_spk_path): + parser = argparse.ArgumentParser() + #root_emb_path = os.path.dirname(__file__) + #root_emb_path = '/data/bingxiao.fang/speaker_identify/SpeakerEncoder/' + parser.add_argument('--model_path', type=str, default=gs_embed_model_spk_path, required=False) + parser.add_argument('--config_path', type=str, default=gs_embed_config_spk_path, required=False) + parser.add_argument('--output_path', type=str, default="outputs/", required=False) + parser.add_argument('--input_path', type=str, default="outputs/", required=False) + parser.add_argument('--input_type', type=str, default="file", required=False) # single_speaker, #multi_speaker + parser.add_argument('--speaker_name', type=str, default="default", required=False) + parser.add_argument('--num_wavs', type=int, default=20, required=False) + parser.add_argument('--num_workers', type=int, default=10, required=False) + # parser.add_argument('--output_name', type=str, required=True) + parser.add_argument('--mode', type=str, default="all_embs", required=False) + parser.add_argument( + "--filelist", + type=str, + default="", + help="Input filelist to extract embedding in rvc.", + ) + + args = parser.parse_args() + + # main(args) + #sys.exit(main_filelist_multi(args)) + + config = load_config(args.config_path) + model = SpeechEmbedding(config, args.model_path) + + return model + +def get_embed(wav_filename, embed_npy, model): + print("@@@@input wav name :", wav_filename) + print("embedding embed_npy: ", embed_npy) + + embedd = model.compute_embedding(wav_filename, 1, 1) + output = embedd[0] + print("embead shapexx:")#, output)#.size()) + + np.save(embed_npy, output)#.detach().cpu().numpy()) + return + + +def main(args): + + config = load_config(args.config_path) + speech_embedding = SpeechEmbedding(config, args.model_path) + if (not os.path.exists(args.output_path)): + os.mkdir(args.output_path) + + emb_dict = {} + # Compute speaker embeddings + if args.input_type == "single_file": + wav_file = args.input_path + embedd = speech_embedding.compute_embedding(wav_file, 1, 1) + embedd = embedd[0] + emb_dict[args.speaker_name] = embedd + print(embedd) + else: + if args.input_type == "single_speaker": + spk_list = [args.speaker_name] + elif args.input_type == "multi_speaker": + spk_list = os.listdir(args.input_path) + + # executor = ProcessPoolExecutor(max_workers=args.num_workers) + for spk_itr, spk_name in enumerate(spk_list): + print(f"========== Speaker {spk_itr}/{len(spk_list)}::") + if args.input_type == "single_speaker": + wav_files = 
glob.glob(os.path.join(args.input_path, "*.wav")) + elif args.input_type == "multi_speaker": + #wav_files = glob.glob(os.path.join(args.input_path, spk_name, "*.wav")) + wav_files = glob.glob(os.path.join(args.input_path, "*.wav")) + print('wav_files:',wav_files) + # Skip if no wav available for speaker + if len(wav_files) == 0: + continue + + # Randomly shuffle and select a sub-list of num_wavs != -1 + if args.num_wavs != -1: + print(f"Selecting {args.num_wavs} random wavs ...") + random.shuffle(wav_files) + wav_files = wav_files[:args.num_wavs] + + # Compute embeddings for all wav files + all_embdds = [] + + if 1: + for wav_filename in wav_files: + basename_tmp = os.path.basename(wav_filename) + # dir_parent = os.path.dirname(os.path.dirname(wav_filename)) + # out_dir = os.path.join(dir_parent, '4_embed256') + if (not os.path.exists(out_dir)): + os.mkdir(out_dir) + # print("mkdir out_dir: ",out_dir) + + out_dirname = os.path.join(out_dir, basename_tmp)[:-4] + '.npy' + src_para.append(wav_filename) + des_para.append(out_dirname) + + ThreadPool_process_core(process_embed_extra, src_para, des_para) + else: + for itr, wav_file in enumerate(wav_files): + # embedd = executor.submit(speech_embedding.compute_embedding, + # wav_file, + # itr, + # len(wav_files)) + embedd = speech_embedding.compute_embedding(wav_file, itr, len(wav_files)) + all_embdds.append((os.path.basename(wav_file), embedd[0])) + + # for basename_tmp, embedd in concurrent.futures.as_completed(all_embdds): + for basename_tmp, embedd in all_embdds: + print('embedd:',embedd.shape) + + # basename_tmp = os.path.basename(wav_files) + out_dirname = os.path.join(args.output_path, basename_tmp)[:-4] + '.npy' + print('out_dirname:',out_dirname) + np.save(out_dirname, embedd) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', type=str, default="pretrained_model/best_model.pth.tar", required=False) + parser.add_argument('--config_path', type=str, default="pretrained_model/config.json", required=False) + parser.add_argument('--output_path', type=str, default="outputs/", required=False) + parser.add_argument('--input_path', type=str, default="outputs/", required=False) + parser.add_argument('--input_type', type=str, default="file", required=False) # single_speaker, #multi_speaker + parser.add_argument('--speaker_name', type=str, default="default", required=False) + parser.add_argument('--num_wavs', type=int, default=20, required=False) + parser.add_argument('--num_workers', type=int, default=10, required=False) + #parser.add_argument('--output_name', type=str, required=True) + parser.add_argument('--mode', type=str, default="all_embs", required=False) + parser.add_argument( + "--filelist", + type=str, + default="", + help="Input filelist to extract embedding in rvc.", + ) + + args = parser.parse_args() + + # main(args) + sys.exit(main_filelist_multi(args)) diff --git a/AIMeiSheng/SpeakerEncoder/limit_threads.py b/AIMeiSheng/SpeakerEncoder/limit_threads.py new file mode 100644 index 0000000..79ddef7 --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/limit_threads.py @@ -0,0 +1,9 @@ +import os + +os.environ["OMP_NUM_THREADS"] = "2" # export OMP_NUM_THREADS=4 +os.environ["OPENBLAS_NUM_THREADS"] = "2" # export OPENBLAS_NUM_THREADS=4 +os.environ["MKL_NUM_THREADS"] = "2" # export MKL_NUM_THREADS=6 +os.environ["VECLIB_MAXIMUM_THREADS"] = "2" # export VECLIB_MAXIMUM_THREADS=4 +os.environ["NUMEXPR_NUM_THREADS"] = "2" # export NUMEXPR_NUM_THREADS=6 + +print("NUMBER OF THREADS IS LIMITED NOW ...") 
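Note that these caps only take effect if `limit_threads` is imported before numpy/torch first initialize their BLAS thread pools, which is why the entry scripts above begin with `from limit_threads import *`. A minimal sketch of the required import order (illustrative only; the last line assumes a CPU build of torch):

# Sketch: the caps must be in the environment before the math libraries start their pools.
import limit_threads  # sets OMP/OpenBLAS/MKL/VECLIB/NUMEXPR caps to 2 threads
import os
import torch

assert os.environ["OMP_NUM_THREADS"] == "2"
print(torch.get_num_threads())  # expected to honor the cap on CPU builds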
diff --git a/AIMeiSheng/SpeakerEncoder/multi_threads_wraper.py b/AIMeiSheng/SpeakerEncoder/multi_threads_wraper.py
new file mode 100644
index 0000000..125d378
--- /dev/null
+++ b/AIMeiSheng/SpeakerEncoder/multi_threads_wraper.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024/1/3 14:11
+
+@author: bingxiao fang
+"""
+import threading
+import concurrent.futures
+from time import sleep, time
+
+
+def my_function1(param):
+    # The task to be executed concurrently goes here
+    start = time()
+    print("Thread {} is running with parameter {}. Sleep for 2 seconds.".format(threading.current_thread().name, param))
+    sleep(2)
+    duration = time() - start
+    print("Thread {} finished in {:.3f} seconds".format(threading.current_thread().name, duration))
+
+
+# Define the task function to be executed
+def my_function2(param1, param2):
+    # Simulate a time-consuming operation
+    total = 0
+    for _ in range(10):
+        total += param2
+    print("Thread {} finished ".format(threading.current_thread().name))
+    return [total, param1]  # param1 is echoed back so each result can be matched to its input
+
+
+# Thread-pool example
+def my_function3():
+    return 'Hello, World!'
+
+
+def ThreadPool_process_core(func_process, params1, params2, num_threads=8):
+    '''
+    Run func_process over paired argument lists with a thread pool; each task's
+    return value identifies which input produced which result.
+    :param func_process: task function taking (param1, param2)
+    :param params1: list of first arguments
+    :param params2: list of second arguments, same length as params1
+    :return: None
+    '''
+    futures = []
+    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+        for idx in range(len(params1)):
+            futures.append(executor.submit(func_process, params1[idx], params2[idx]))
+    cnt = 0
+    for future in concurrent.futures.as_completed(futures):
+        cnt += 1
+        #print(f"process finished {cnt}")
+        #print(future.result())
+
+
+def ThreadPool_main_process():
+    params1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # parameter lists
+    params2 = [11, 21, 3, 4, 5, 6, 7, 8, 9]
+    ThreadPool_process_core(my_function2, params1, params2)
+
+
+if __name__ == "__main__":
+    ThreadPool_main_process()
diff --git a/AIMeiSheng/SpeakerEncoder/scripts/batch_folder.sh b/AIMeiSheng/SpeakerEncoder/scripts/batch_folder.sh
new file mode 100644
index 0000000..b4bd394
--- /dev/null
+++ b/AIMeiSheng/SpeakerEncoder/scripts/batch_folder.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+
+input_path="path_to_wavs_folder"
+input_type="single_speaker"
+num_wavs=-1 # -1 for computing embeddings of all wav files
+num_workers=8
+output_name="output"
+speaker_name="speaker_name"
+
+python compute_embedding.py --input_path="$input_path"\
+                            --input_type="$input_type"\
+                            --num_wavs="$num_wavs"\
+                            --num_workers="$num_workers"\
+                            --output_name="$output_name" \
+                            --speaker_name="$speaker_name"
+
diff --git a/AIMeiSheng/SpeakerEncoder/scripts/multispk_comvoiceDE.sh b/AIMeiSheng/SpeakerEncoder/scripts/multispk_comvoiceDE.sh
new file mode 100644
index 0000000..1cc8083
--- /dev/null
+++ b/AIMeiSheng/SpeakerEncoder/scripts/multispk_comvoiceDE.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+
+input_path="/raid/hhemati/Datasets/Speech/TTS/CommonVoice/de/wavs/"
+input_type="multi_speaker"
+num_wavs=-1
+num_workers=10
+output_name="comvoiceDE"
+
+
+python compute_embedding.py --input_path="$input_path"\
+                            --input_type="$input_type"\
+                            --num_wavs="$num_wavs"\
+                            --num_workers="$num_workers"\
+                            --output_name="$output_name"
+
diff --git a/AIMeiSheng/SpeakerEncoder/scripts/multispk_vctk.sh b/AIMeiSheng/SpeakerEncoder/scripts/multispk_vctk.sh
new file mode 100644
index 0000000..29a762a
--- /dev/null
+++ 
b/AIMeiSheng/SpeakerEncoder/scripts/multispk_vctk.sh @@ -0,0 +1,16 @@ +#!/bin/bash + + +input_path="/raid/hhemati/Datasets/Speech/TTS/English/VCTK-Corpus/wavs/" +input_type="multi_speaker" +num_wavs=-1 +num_workers=8 +output_name="vctk" + + +python compute_embedding.py --input_path="$input_path"\ + --input_type="$input_type"\ + --num_wavs="$num_wavs"\ + --num_workers="$num_workers"\ + --output_name="$output_name" + diff --git a/AIMeiSheng/SpeakerEncoder/scripts/singlespk_css10de.sh b/AIMeiSheng/SpeakerEncoder/scripts/singlespk_css10de.sh new file mode 100644 index 0000000..ff4f94c --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/scripts/singlespk_css10de.sh @@ -0,0 +1,19 @@ + +#!/bin/bash + + +input_path="/raid/hhemati/Datasets/Speech/TTS/CSS10/GermanSingleSpeaker/de/*/" +input_type="single_speaker" +num_wavs=-1 +num_workers=8 +output_name="gss" +speaker_name="gss" + +python compute_embedding.py --input_path="$input_path"\ + --input_type="$input_type"\ + --num_wavs="$num_wavs"\ + --num_workers="$num_workers"\ + --output_name="$output_name" \ + --speaker_name="$speaker_name" + + diff --git a/AIMeiSheng/SpeakerEncoder/scripts/singlespk_lj.sh b/AIMeiSheng/SpeakerEncoder/scripts/singlespk_lj.sh new file mode 100644 index 0000000..ae187ab --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/scripts/singlespk_lj.sh @@ -0,0 +1,18 @@ + +#!/bin/bash + + +input_path="/raid/hhemati/Datasets/Speech/TTS/English/LJSpeech-1.1/wavs/" +input_type="single_speaker" +num_wavs=-1 +num_workers=8 +output_name="lj" +speaker_name="lj" + +python compute_embedding.py --input_path="$input_path"\ + --input_type="$input_type"\ + --num_wavs="$num_wavs"\ + --num_workers="$num_workers"\ + --output_name="$output_name" \ + --speaker_name="$speaker_name" + diff --git a/AIMeiSheng/SpeakerEncoder/scripts/singlespk_miriam.sh b/AIMeiSheng/SpeakerEncoder/scripts/singlespk_miriam.sh new file mode 100644 index 0000000..59e2590 --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/scripts/singlespk_miriam.sh @@ -0,0 +1,18 @@ + +#!/bin/bash + + +input_path="/raid/hhemati/Datasets/Speech/TTS/VocallyYours/MiriamMeckel_Split/audios" +input_type="single_speaker" +num_wavs=-1 +num_workers=8 +output_name="miriam" +speaker_name="miriam" + +python compute_embedding.py --input_path="$input_path"\ + --input_type="$input_type"\ + --num_wavs="$num_wavs"\ + --num_workers="$num_workers"\ + --output_name="$output_name" \ + --speaker_name="$speaker_name" + diff --git a/AIMeiSheng/SpeakerEncoder/speaker_encoder/__init__.py b/AIMeiSheng/SpeakerEncoder/speaker_encoder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/AIMeiSheng/SpeakerEncoder/speaker_encoder/audio.py b/AIMeiSheng/SpeakerEncoder/speaker_encoder/audio.py new file mode 100644 index 0000000..3a77e9c --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/speaker_encoder/audio.py @@ -0,0 +1,392 @@ +import librosa +import soundfile as sf +import numpy as np +import scipy.io.wavfile +import scipy.signal +import pyworld as pw + + +# pylint: disable=attribute-defined-outside-init +class StandardScaler(): + + def set_stats(self, mean, scale): + self.mean_ = mean + self.scale_ = scale + + def reset_stats(self): + delattr(self, 'mean_') + delattr(self, 'scale_') + + def transform(self, X): + X = np.asarray(X) + X -= self.mean_ + X /= self.scale_ + return X + + def inverse_transform(self, X): + X = np.asarray(X) + X *= self.scale_ + X += self.mean_ + return X + + +#pylint: disable=too-many-public-methods +class AudioProcessor(object): + def __init__(self, + sample_rate=None, + num_mels=None, + 
min_level_db=None, + frame_shift_ms=None, + frame_length_ms=None, + hop_length=None, + win_length=None, + ref_level_db=None, + fft_size=1024, + power=None, + preemphasis=0.0, + signal_norm=None, + symmetric_norm=None, + max_norm=None, + mel_fmin=None, + mel_fmax=None, + spec_gain=20, + stft_pad_mode='reflect', + clip_norm=True, + griffin_lim_iters=None, + do_trim_silence=False, + trim_db=60, + do_sound_norm=False, + stats_path=None, + **_): + + print(" > Setting up Audio Processor...") + # setup class attributed + self.sample_rate = sample_rate + self.num_mels = num_mels + self.min_level_db = min_level_db or 0 + self.frame_shift_ms = frame_shift_ms + self.frame_length_ms = frame_length_ms + self.ref_level_db = ref_level_db + self.fft_size = fft_size + self.power = power + self.preemphasis = preemphasis + self.griffin_lim_iters = griffin_lim_iters + self.signal_norm = signal_norm + self.symmetric_norm = symmetric_norm + self.mel_fmin = mel_fmin or 0 + self.mel_fmax = mel_fmax + self.spec_gain = float(spec_gain) + self.stft_pad_mode = stft_pad_mode + self.max_norm = 1.0 if max_norm is None else float(max_norm) + self.clip_norm = clip_norm + self.do_trim_silence = do_trim_silence + self.trim_db = trim_db + self.do_sound_norm = do_sound_norm + self.stats_path = stats_path + # setup stft parameters + if hop_length is None: + # compute stft parameters from given time values + self.hop_length, self.win_length = self._stft_parameters() + else: + # use stft parameters from config file + self.hop_length = hop_length + self.win_length = win_length + assert min_level_db != 0.0, " [!] min_level_db is 0" + assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size" + members = vars(self) + for key, value in members.items(): + print(" | > {}:{}".format(key, value)) + # create spectrogram utils + self.mel_basis = self._build_mel_basis() + self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) + # setup scaler + if stats_path: + mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path) + self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + self.signal_norm = True + self.max_norm = None + self.clip_norm = None + self.symmetric_norm = None + + ### setting up the parameters ### + def _build_mel_basis(self, ): + if self.mel_fmax is not None: + assert self.mel_fmax <= self.sample_rate // 2 + return librosa.filters.mel( + self.sample_rate, + self.fft_size, + n_mels=self.num_mels, + fmin=self.mel_fmin, + fmax=self.mel_fmax) + + def _stft_parameters(self, ): + """Compute necessary stft parameters with given time values""" + factor = self.frame_length_ms / self.frame_shift_ms + assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms" + hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) + win_length = int(hop_length * factor) + return hop_length, win_length + + ### normalization ### + def _normalize(self, S): + """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]""" + #pylint: disable=no-else-return + S = S.copy() + if self.signal_norm: + # mean-var scaling + if hasattr(self, 'mel_scaler'): + if S.shape[0] == self.num_mels: + return self.mel_scaler.transform(S.T).T + elif S.shape[0] == self.fft_size / 2: + return self.linear_scaler.transform(S.T).T + else: + raise RuntimeError(' [!] 
Mean-Var stats does not match the given feature dimensions.') + # range normalization + S -= self.ref_level_db # discard certain range of DB assuming it is air noise + S_norm = ((S - self.min_level_db) / (-self.min_level_db)) + if self.symmetric_norm: + S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm + if self.clip_norm: + S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) # pylint: disable=invalid-unary-operand-type + return S_norm + else: + S_norm = self.max_norm * S_norm + if self.clip_norm: + S_norm = np.clip(S_norm, 0, self.max_norm) + return S_norm + else: + return S + + def _denormalize(self, S): + """denormalize values""" + #pylint: disable=no-else-return + S_denorm = S.copy() + if self.signal_norm: + # mean-var scaling + if hasattr(self, 'mel_scaler'): + if S_denorm.shape[0] == self.num_mels: + return self.mel_scaler.inverse_transform(S_denorm.T).T + elif S_denorm.shape[0] == self.fft_size / 2: + return self.linear_scaler.inverse_transform(S_denorm.T).T + else: + raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.') + if self.symmetric_norm: + if self.clip_norm: + S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) #pylint: disable=invalid-unary-operand-type + S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db + return S_denorm + self.ref_level_db + else: + if self.clip_norm: + S_denorm = np.clip(S_denorm, 0, self.max_norm) + S_denorm = (S_denorm * -self.min_level_db / + self.max_norm) + self.min_level_db + return S_denorm + self.ref_level_db + else: + return S_denorm + + ### Mean-STD scaling ### + def load_stats(self, stats_path): + stats = np.load(stats_path, allow_pickle=True).item() #pylint: disable=unexpected-keyword-arg + mel_mean = stats['mel_mean'] + mel_std = stats['mel_std'] + linear_mean = stats['linear_mean'] + linear_std = stats['linear_std'] + stats_config = stats['audio_config'] + # check all audio parameters used for computing stats + skip_parameters = ['griffin_lim_iters', 'stats_path', 'do_trim_silence', 'ref_level_db', 'power'] + for key in stats_config.keys(): + if key in skip_parameters: + continue + if key not in ['sample_rate', 'trim_db']: + assert stats_config[key] == self.__dict__[key],\ + f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + return mel_mean, mel_std, linear_mean, linear_std, stats_config + + # pylint: disable=attribute-defined-outside-init + def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std): + self.mel_scaler = StandardScaler() + self.mel_scaler.set_stats(mel_mean, mel_std) + self.linear_scaler = StandardScaler() + self.linear_scaler.set_stats(linear_mean, linear_std) + + ### DB and AMP conversion ### + # pylint: disable=no-self-use + def _amp_to_db(self, x): + return self.spec_gain * np.log10(np.maximum(1e-5, x)) + + # pylint: disable=no-self-use + def _db_to_amp(self, x): + return np.power(10.0, x / self.spec_gain) + + ### Preemphasis ### + def apply_preemphasis(self, x): + if self.preemphasis == 0: + raise RuntimeError(" [!] Preemphasis is set 0.0.") + return scipy.signal.lfilter([1, -self.preemphasis], [1], x) + + def apply_inv_preemphasis(self, x): + if self.preemphasis == 0: + raise RuntimeError(" [!] 
Preemphasis is set 0.0.") + return scipy.signal.lfilter([1], [1, -self.preemphasis], x) + + ### SPECTROGRAMs ### + def _linear_to_mel(self, spectrogram): + return np.dot(self.mel_basis, spectrogram) + + def _mel_to_linear(self, mel_spec): + return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec)) + + def spectrogram(self, y): + if self.preemphasis != 0: + D = self._stft(self.apply_preemphasis(y)) + else: + D = self._stft(y) + S = self._amp_to_db(np.abs(D)) + return self._normalize(S) + + def melspectrogram(self, y): + if self.preemphasis != 0: + D = self._stft(self.apply_preemphasis(y)) + else: + D = self._stft(y) + S = self._amp_to_db(self._linear_to_mel(np.abs(D))) + return self._normalize(S) + + def inv_spectrogram(self, spectrogram): + """Converts spectrogram to waveform using librosa""" + S = self._denormalize(spectrogram) + S = self._db_to_amp(S) + # Reconstruct phase + if self.preemphasis != 0: + return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) + return self._griffin_lim(S**self.power) + + def inv_melspectrogram(self, mel_spectrogram): + '''Converts melspectrogram to waveform using librosa''' + D = self._denormalize(mel_spectrogram) + S = self._db_to_amp(D) + S = self._mel_to_linear(S) # Convert back to linear + if self.preemphasis != 0: + return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) + return self._griffin_lim(S**self.power) + + def out_linear_to_mel(self, linear_spec): + S = self._denormalize(linear_spec) + S = self._db_to_amp(S) + S = self._linear_to_mel(np.abs(S)) + S = self._amp_to_db(S) + mel = self._normalize(S) + return mel + + ### STFT and ISTFT ### + def _stft(self, y): + return librosa.stft( + y=y, + n_fft=self.fft_size, + hop_length=self.hop_length, + win_length=self.win_length, + pad_mode=self.stft_pad_mode, + ) + + def _istft(self, y): + return librosa.istft( + y, hop_length=self.hop_length, win_length=self.win_length) + + def _griffin_lim(self, S): + angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) + S_complex = np.abs(S).astype(np.complex) + y = self._istft(S_complex * angles) + for _ in range(self.griffin_lim_iters): + angles = np.exp(1j * np.angle(self._stft(y))) + y = self._istft(S_complex * angles) + return y + + def compute_stft_paddings(self, x, pad_sides=1): + '''compute right padding (final frame) or both sides padding (first and final frames) + ''' + assert pad_sides in (1, 2) + pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] + if pad_sides == 1: + return 0, pad + return pad // 2, pad // 2 + pad % 2 + + ### Compute F0 ### + def compute_f0(self, x): + f0, t = pw.dio( + x.astype(np.double), + fs=self.sample_rate, + f0_ceil=self.mel_fmax, + frame_period=1000 * self.hop_length / self.sample_rate, + ) + f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) + return f0 + + ### Audio Processing ### + def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): + window_length = int(self.sample_rate * min_silence_sec) + hop_length = int(window_length / 4) + threshold = self._db_to_amp(threshold_db) + for x in range(hop_length, len(wav) - window_length, hop_length): + if np.max(wav[x:x + window_length]) < threshold: + return x + hop_length + return len(wav) + + def trim_silence(self, wav): + """ Trim silent parts with a threshold and 0.01 sec margin """ + margin = int(self.sample_rate * 0.01) + wav = wav[margin:-margin] + return librosa.effects.trim( + wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0] + + @staticmethod + def 
sound_norm(x): + return x / abs(x).max() * 0.9 + + ### save and load ### + def load_wav(self, filename, sr=None): + if sr is None: + x, sr = sf.read(filename) + assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr) + else: + x, sr = librosa.load(filename, sr=sr) + if self.do_trim_silence: + try: + x = self.trim_silence(x) + except ValueError: + print(f' [!] File cannot be trimmed for silence - {filename}') + if self.do_sound_norm: + x = self.sound_norm(x) + return x + + def save_wav(self, wav, path): + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16)) + + @staticmethod + def mulaw_encode(wav, qc): + mu = 2 ** qc - 1 + # wav_abs = np.minimum(np.abs(wav), 1.0) + signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu) + # Quantize signal to the specified number of levels. + signal = (signal + 1) / 2 * mu + 0.5 + return np.floor(signal,) + + @staticmethod + def mulaw_decode(wav, qc): + """Recovers waveform from quantized values.""" + mu = 2 ** qc - 1 + x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) + return x + + + @staticmethod + def encode_16bits(x): + return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16) + + @staticmethod + def quantize(x, bits): + return (x + 1.) * (2**bits - 1) / 2 + + @staticmethod + def dequantize(x, bits): + return 2 * x / (2**bits - 1) - 1 diff --git a/AIMeiSheng/SpeakerEncoder/speaker_encoder/io.py b/AIMeiSheng/SpeakerEncoder/speaker_encoder/io.py new file mode 100644 index 0000000..5188ef9 --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/speaker_encoder/io.py @@ -0,0 +1,35 @@ +import os +import re +import json + + +class AttrDict(dict): + """A custom dict which converts dict keys + to class attributes""" + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def load_config(config_path: str) -> AttrDict: + """Load config files and discard comments + Args: + config_path (str): path to config file. 
+ """ + config = AttrDict() + + ext = os.path.splitext(config_path)[1] + if ext in (".yml", ".yaml"): + with open(config_path, "r") as f: + data = yaml.safe_load(f) + else: + # fallback to json + with open(config_path, "r") as f: + input_str = f.read() + # handle comments + input_str = re.sub(r'\\\n', '', input_str) + input_str = re.sub(r'//.*\n', '\n', input_str) + data = json.loads(input_str) + + config.update(data) + return config \ No newline at end of file diff --git a/AIMeiSheng/SpeakerEncoder/speaker_encoder/model.py b/AIMeiSheng/SpeakerEncoder/speaker_encoder/model.py new file mode 100644 index 0000000..fa241b0 --- /dev/null +++ b/AIMeiSheng/SpeakerEncoder/speaker_encoder/model.py @@ -0,0 +1,112 @@ +import torch +from torch import nn + + +class LSTMWithProjection(nn.Module): + def __init__(self, input_size, hidden_size, proj_size): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.proj_size = proj_size + self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) + self.linear = nn.Linear(hidden_size, proj_size, bias=False) + + def forward(self, x): + self.lstm.flatten_parameters() + o, (_, _) = self.lstm(x) + return self.linear(o) + +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, + hidden_size=lstm_dim, + num_layers=num_lstm_layers, + batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) + +class SpeakerEncoder(nn.Module): + def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): + super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection + layers = [] + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) + + self._init_layers() + + def _init_layers(self): + for name, param in self.layers.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0.0) + elif "weight" in name: + nn.init.xavier_normal_(param) + + def forward(self, x): + # TODO: implement state passing for lstms + d = self.layers(x) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d + + @torch.no_grad() + def inference(self, x): + d = self.layers.forward(x) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d + + def compute_embedding(self, x, num_frames=160, overlap=0.5): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + num_overlap = int(num_frames * overlap) + max_len = x.shape[1] + embed = None + cur_iter = 0 + for offset in range(0, max_len, num_frames - num_overlap): + cur_iter += 1 + end_offset = min(x.shape[1], offset + num_frames) + frames = x[:, offset:end_offset] + if embed is None: + embed = self.inference(frames) + else: + embed += self.inference(frames) + return embed / cur_iter + + def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): + """ + 
Generate embeddings for a batch of utterances + x: BxTxD + """ + num_overlap = num_frames * overlap + max_len = x.shape[1] + embed = None + num_iters = seq_lens / (num_frames - num_overlap) + cur_iter = 0 + for offset in range(0, max_len, num_frames - num_overlap): + cur_iter += 1 + end_offset = min(x.shape[1], offset + num_frames) + frames = x[:, offset:end_offset] + if embed is None: + embed = self.inference(frames) + else: + embed[cur_iter <= num_iters, :] += self.inference( + frames[cur_iter <= num_iters, :, :] + ) + return embed / num_iters \ No newline at end of file diff --git a/AIMeiSheng/cos_similar_ui_zoom.py b/AIMeiSheng/cos_similar_ui_zoom.py new file mode 100644 index 0000000..87b74f5 --- /dev/null +++ b/AIMeiSheng/cos_similar_ui_zoom.py @@ -0,0 +1,54 @@ +import json +import shutil + +import gradio as gr + +import zipfile +import os +import sys +import ffmpeg + + +import gradio as gr +import librosa,soundfile +#sys.path.append('./AIMeiSheng/SpeakerEncoder') +from SpeakerEncoder.compute_embedding_svc_multi_test import get_embed, get_embed_model +from SpeakerEncoder.cal_cos_distance_folder import load_and_cal_distance +from docker_demo.common import gs_embed_model_spk_path,gs_embed_config_spk_path + +class cos_similar(): + def __init__(self,): + self.embed_model = self.load_model() + self.embed_npy = 'wav1.npy' + self.svc_embed_npy = 'wav2.npy' + + def get_cos_similar_spkenc(self, wav1, wav2): + get_embed(wav1, self.embed_npy, self.embed_model) + get_embed(wav2, self.svc_embed_npy, self.embed_model) + + similar = load_and_cal_distance(self.embed_npy, self.svc_embed_npy) + print("target_npy:", self.embed_npy, "svc_npy:", self.svc_embed_npy) + print("######similar:", similar) + return similar + + def get_spk_embed(self,wav_in, embed_npy): + get_embed(wav_in, embed_npy, self.embed_model) + return + + def get_cos_similar_raw(self, wav1, wav2): + get_embed(wav1, self.embed_npy, self.embed_model) + get_embed(wav2, self.svc_embed_npy, self.embed_model) + + similar = load_and_cal_distance(self.embed_npy, self.svc_embed_npy) + print("target_npy:", self.embed_npy, "svc_npy:", self.svc_embed_npy) + print("######similar:", similar) + return similar + + def load_model(self): + embed_model = get_embed_model(gs_embed_model_spk_path, gs_embed_config_spk_path) + return embed_model + + + + + diff --git a/AIMeiSheng/docker_demo/Dockerfile b/AIMeiSheng/docker_demo/Dockerfile index dd9f0dd..6b9921f 100644 --- a/AIMeiSheng/docker_demo/Dockerfile +++ b/AIMeiSheng/docker_demo/Dockerfile @@ -1,25 +1,25 @@ # 系统版本 CUDA Version 11.8.0 # NAME="CentOS Linux" VERSION="7 (Core)" # FROM starmaker.tencentcloudcr.com/starmaker/av/av:1.1 # 基础镜像, python3.9,cuda118,centos7,外加ffmpeg #FROM starmaker.tencentcloudcr.com/starmaker/av/av_base:1.0 FROM av_base_test:1.0 RUN source /etc/profile && sed -i 's|mirrorlist=|#mirrorlist=|g' /etc/yum.repos.d/CentOS-Base.repo && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Base.repo && yum clean all && yum install -y unzip && yum install -y libsndfile && yum install -y libsamplerate libsamplerate-devel -RUN source /etc/profile && pip3 install librosa && pip3 install gradio && pip3 install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +RUN source /etc/profile && pip3 install librosa==0.9.1 && pip3 install gradio && pip3 install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 RUN source /etc/profile && pip3 install urllib3==1.26.15 && pip3 install 
coscmd && coscmd config -a AKIDoQmshFWXGitnQmrfCTYNwEExPaU6RVHm -s F9n9E2ZonWy93f04qMaYFfogHadPt62h -b log-sg-1256122840 -r ap-singapore RUN source /etc/profile && pip3 install asteroid-filterbanks RUN source /etc/profile && pip3 install praat-parselmouth==0.4.3 RUN source /etc/profile && pip3 install pyworld RUN source /etc/profile && pip3 install faiss-cpu RUN source /etc/profile && pip3 install torchcrepe RUN source /etc/profile && pip3 install thop RUN source /etc/profile && pip3 install ffmpeg-python RUN source /etc/profile && pip3 install fairseq RUN source /etc/profile && pip3 install redis==4.5.0 WORKDIR /data/code -CMD ["/bin/bash", "-c", "source /etc/profile; export PYTHONPATH=/data/code; cd /data/code/AIMeiSheng/docker_demo; python3 offline_server.py"] \ No newline at end of file +CMD ["/bin/bash", "-c", "source /etc/profile; export PYTHONPATH=/data/code; cd /data/code/AIMeiSheng/docker_demo; python3 offline_server.py"] diff --git a/AIMeiSheng/docker_demo/common.py b/AIMeiSheng/docker_demo/common.py index ca42a27..7602d60 100644 --- a/AIMeiSheng/docker_demo/common.py +++ b/AIMeiSheng/docker_demo/common.py @@ -1,105 +1,108 @@ import os import time # import logging import urllib, urllib.request # 测试/正式环境 gs_prod = True gs_tmp_dir = "/tmp/ai_meisheng_tmp" gs_model_dir = "/tmp/ai_meisheng_models" gs_resource_cache_dir = "/tmp/ai_meisheng_resource_cache" -gs_svc_model_path = os.path.join(gs_model_dir, - "weights/xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth") gs_embed_model_path = os.path.join(gs_model_dir, "RawNet3/models/weights/model.pt") +gs_svc_model_path = os.path.join(gs_model_dir, + "weights/xusong_v2_org_version_alldata_embed_spkenx200x_vocal_e22_s95040.pth") gs_hubert_model_path = os.path.join(gs_model_dir, "hubert.pt") gs_rmvpe_model_path = os.path.join(gs_model_dir, "rmvpe.pt") +gs_embed_model_spk_path = os.path.join(gs_model_dir, "SpeakerEncoder/pretrained_model/best_model.pth.tar") +gs_embed_config_spk_path = os.path.join(gs_model_dir, "SpeakerEncoder/pretrained_model/config.json") # errcode + gs_err_code_success = 0 gs_err_code_download_vocal = 100 gs_err_code_download_svc_url = 101 gs_err_code_svc_process = 102 gs_err_code_transcode = 103 gs_err_code_volume_adjust = 104 gs_err_code_upload = 105 gs_err_code_params = 106 gs_err_code_pending = 107 gs_err_code_target_silence = 108 gs_err_code_too_many_connections = 429 gs_redis_conf = { "host": "av-credis.starmaker.co", "port": 6379, "pwd": "lKoWEhz%jxTO", } gs_server_redis_conf = { "producer": "test_ai_meisheng_producer", # 输入的队列 "ai_meisheng_key_prefix": "test_ai_meisheng_key_", # 存储结果情况 } if gs_prod: gs_server_redis_conf = { "producer": "ai_meisheng_producer", # 输入的队列 "ai_meisheng_key_prefix": "ai_meisheng_key_", # 存储结果情况 } def download2disk(url, dst_path): try: urllib.request.urlretrieve(url, dst_path) return os.path.exists(dst_path) except Exception as ex: print(f"download url={url} error", ex) return False def exec_cmd(cmd): # gs_logger.info(cmd) print(cmd) ret = os.system(cmd) if ret != 0: return False return True def exec_cmd_and_result(cmd): r = os.popen(cmd) text = r.read() r.close() return text def upload_file2cos(key, file_path, region='ap-singapore', bucket_name='av-audit-sync-sg-1256122840'): """ 将文件上传到cos :param key: 桶上的具体地址 :param file_path: 本地文件地址 :param region: 区域 :param bucket_name: 桶地址 :return: """ gs_coscmd = "coscmd" gs_coscmd_conf = "~/.cos.conf" cmd = "{} -c {} -r {} -b {} upload {} {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, file_path, key) if exec_cmd(cmd): cmd 
= "{} -c {} -r {} -b {} info {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, key) \ + "| grep Content-Length |awk \'{print $2}\'" res_str = exec_cmd_and_result(cmd) # logging.info("{},res={}".format(key, res_str)) size = float(res_str) if size > 0: return True return False return False def check_input(input_data): key_list = ["record_song_url", "target_url", "start", "end", "vocal_loudness", "female_recording_url", "male_recording_url"] for key in key_list: if key not in input_data.keys(): return False return True diff --git a/AIMeiSheng/docker_demo/svc_online.py b/AIMeiSheng/docker_demo/svc_online.py index e910f5f..a52ab24 100644 --- a/AIMeiSheng/docker_demo/svc_online.py +++ b/AIMeiSheng/docker_demo/svc_online.py @@ -1,188 +1,190 @@ # -*- coding: UTF-8 -*- """ SVC的核心处理逻辑 """ import os import time import socket import shutil import hashlib from AIMeiSheng.meisheng_svc_final import load_model, process_svc_online +from AIMeiSheng.cos_similar_ui_zoom import cos_similar from AIMeiSheng.meisheng_env_preparex import meisheng_env_prepare from AIMeiSheng.voice_classification.online.voice_class_online_fang import VoiceClass, download_volume_balanced from AIMeiSheng.docker_demo.common import * import logging hostname = socket.gethostname() log_file_name = f"{os.path.dirname(os.path.abspath(__file__))}/av_meisheng_{hostname}.log" # 设置logger svc_offline_logger = logging.getLogger("svc_offline") file_handler = logging.FileHandler(log_file_name) file_handler.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S') file_handler.setFormatter(formatter) if gs_prod: svc_offline_logger.addHandler(file_handler) if os.path.exists(gs_tmp_dir): shutil.rmtree(gs_tmp_dir) os.makedirs(gs_model_dir, exist_ok=True) os.makedirs(gs_resource_cache_dir, exist_ok=True) # 预设参数 gs_gender_models_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/hub/voice_classification/models.zip" gs_volume_bin_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/ebur128_tool" class GSWorkerAttr: def __init__(self, input_data): # 取出输入资源 vocal_url = input_data["record_song_url"] target_url = input_data["target_url"] start = input_data["start"] # 单位是ms end = input_data["end"] # 单位是ms vocal_loudness = input_data["vocal_loudness"] female_recording_url = input_data["female_recording_url"] male_recording_url = input_data["male_recording_url"] self.distinct_id = hashlib.md5(vocal_url.encode()).hexdigest() self.tmp_dir = os.path.join(gs_tmp_dir, self.distinct_id) if os.path.exists(self.tmp_dir): shutil.rmtree(self.tmp_dir) os.makedirs(self.tmp_dir) self.vocal_url = vocal_url self.target_url = target_url ext = vocal_url.split(".")[-1] self.vocal_path = os.path.join(self.tmp_dir, self.distinct_id + f"_in.{ext}") self.target_wav_path = os.path.join(self.tmp_dir, self.distinct_id + "_out.wav") self.target_wav_ad_path = os.path.join(self.tmp_dir, self.distinct_id + "_out_ad.wav") self.target_path = os.path.join(self.tmp_dir, self.distinct_id + "_out.m4a") self.female_svc_source_url = female_recording_url self.male_svc_source_url = male_recording_url ext = female_recording_url.split(".")[-1] self.female_svc_source_path = os.path.join(gs_resource_cache_dir, hashlib.md5(female_recording_url.encode()).hexdigest() + "." + ext) ext = male_recording_url.split(".")[-1] self.male_svc_source_path = os.path.join(gs_resource_cache_dir, hashlib.md5(male_recording_url.encode()).hexdigest() + "." 
+ ext) self.st_tm = start self.ed_tm = end self.target_loudness = vocal_loudness def log_info_name(self): return f"d_id={self.distinct_id}, vocal_url={self.vocal_url}" def rm_cache(self): if os.path.exists(self.tmp_dir): shutil.rmtree(self.tmp_dir) def init_gender_model(): """ 下载模型 :return: """ dst_model_dir = os.path.join(gs_model_dir, "voice_classification") if not os.path.exists(dst_model_dir): dst_zip_path = os.path.join(gs_model_dir, "models.zip") if not download2disk(gs_gender_models_url, dst_zip_path): svc_offline_logger.fatal(f"download gender_model err={gs_gender_models_url}") cmd = f"cd {gs_model_dir}; unzip {dst_zip_path}; mv models voice_classification; rm -f {dst_zip_path}" os.system(cmd) if not os.path.exists(dst_model_dir): svc_offline_logger.fatal(f"unzip {dst_zip_path} err") music_voice_pure_model = os.path.join(dst_model_dir, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(dst_model_dir, "voice_10_v5.pth") gender_pure_model = os.path.join(dst_model_dir, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(dst_model_dir, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) return vc def init_svc_model(): meisheng_env_prepare(logging, gs_model_dir) embed_model, hubert_model = load_model() - return embed_model, hubert_model + cs_sim = cos_similar() + return embed_model, hubert_model,cs_sim def download_volume_adjustment(): """ 下载音量调整工具 :return: """ volume_bin_path = os.path.join(gs_model_dir, "ebur128_tool") if not os.path.exists(volume_bin_path): if not download2disk(gs_volume_bin_url, volume_bin_path): svc_offline_logger.fatal(f"download volume_bin err={gs_volume_bin_url}") os.system(f"chmod +x {volume_bin_path}") def volume_adjustment(wav_path, target_loudness, out_path): """ 音量调整 :param wav_path: :param target_loudness: :param out_path: :return: """ volume_bin_path = os.path.join(gs_model_dir, "ebur128_tool") cmd = f"{volume_bin_path} {wav_path} {target_loudness} {out_path}" os.system(cmd) class SVCOnline: def __init__(self): st = time.time() self.gender_model = init_gender_model() - self.embed_model, self.hubert_model = init_svc_model() + self.embed_model, self.hubert_model, self.cs_sim = init_svc_model() download_volume_adjustment() download_volume_balanced() svc_offline_logger.info(f"svc init finished, sp = {time.time() - st}") def gender_process(self, worker_attr): st = time.time() gender, female_rate, is_pure = self.gender_model.process(worker_attr.vocal_path) svc_offline_logger.info( f"{worker_attr.vocal_url}, gender={gender}, female_rate={female_rate}, is_pure={is_pure}, " f"gender_process sp = {time.time() - st}") if gender == 0: gender = 'female' elif gender == 1: gender = 'male' elif female_rate > 0.5: gender = 'female' else: gender = 'male' svc_offline_logger.info(f"{worker_attr.vocal_url}, modified gender={gender}") # err = gs_err_code_success # if female_rate == -1: # err = gs_err_code_target_silence return gender, gs_err_code_success def process(self, worker_attr): gender, err = self.gender_process(worker_attr) if err != gs_err_code_success: return gender, err song_path = worker_attr.female_svc_source_path if gender == "male": song_path = worker_attr.male_svc_source_path params = {'gender': gender, 'tst': worker_attr.st_tm, "tnd": worker_attr.ed_tm, 'delay': 0, 'song_path': None} st = time.time() err_code = process_svc_online(song_path, worker_attr.vocal_path, worker_attr.target_wav_path, self.embed_model, - self.hubert_model, params) + 
self.hubert_model, self.cs_sim, params) svc_offline_logger.info(f"{worker_attr.vocal_url}, err_code={err_code} process svc sp = {time.time() - st}") return gender, err_code diff --git a/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x.py b/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x.py new file mode 100644 index 0000000..de081e2 --- /dev/null +++ b/AIMeiSheng/lib/infer_pack/models_embed_in_dec_diff_control_enc_spken200x.py @@ -0,0 +1,1271 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from lib.infer_pack import modules +from lib.infer_pack import attentions_in_dec as attentions +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from lib.infer_pack.commons import init_weights +import numpy as np +from lib.infer_pack import commons +from thop import profile +from diffuse_fang.diffUse_wraper import diff_decoder,ddpm_para +ddpm_dp = ddpm_para() + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + #self.emb_g = nn.Linear(256, hidden_channels) + + def forward(self, phone, pitch, lengths,g):#fang add + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) #+ self.emb_g(g) + #print("@@@x:",x.shape) + x = x * 
math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + #print("@@@x1:",x.shape) + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + #x = self.encoder(x * x_mask, x_mask,g) + x = self.encoder(x * x_mask, x_mask,g)#fang add + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask,x + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1)#均值和方差 fang + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask ##随机采样 fang + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = 
upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = 
torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + self.ups_g = nn.ModuleList()# fang add + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + self.ups_g.append( + nn.Conv1d(upsample_initial_channel,upsample_initial_channel // (2 ** (i + 1) ), 1) + #F.interpolate(input, scale_factor=2, mode='nearest') + )# fang add + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + 
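+        # Note on the conditioning path added above ("fang add"): alongside the usual NSF
+        # noise_convs, self.ups_g holds one 1x1 Conv1d per upsampling stage that maps the
+        # speaker conditioning (upsample_initial_channel wide after self.cond) down to that
+        # stage's channel count, so forward() can re-inject it additively at every decoder
+        # resolution (xg = self.ups_g[i](tmp_g); x = x + xg).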
self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + #x = x + self.cond(g) ##org + tmp_g = self.cond(g) ##fang add + x = x + tmp_g ##fang add + #print('###@@@@##x:',x.shape ) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xg = self.ups_g[i](tmp_g) #fang add + x = x + xg #fang add + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + #print('@@@@##x:',x.shape) + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, + "24k": 24000, +} + + +class SynthesizerTrnMs256NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() 
+ self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + #print("@@@pitch.shape: ",pitch.shape) + #g = ds.unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) #按照self.segment_size这个长度,进行随机切割z,长度固定,开始位置不同存在ids_slice中,z_slice是切割的结果, fang + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + nsff0 = nsff0[:, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + print('z shape: ',z.shape) + print('x_mask shape: ',x_mask.shape) + z_x_mask = z * x_mask + print('z_x_mask shape: ',z_x_mask.shape) + print('nsff0 shape:p', nsff0.shape) + print('g shape: ',g.shape) + o = self.dec(z * x_mask, nsff0, g=g) + + self.get_floats() + return o, x_mask, (z, z_p, m_p, logs_p) + + def get_floats(self,): + T = 21.4 #郭宇_但愿人长久_40k.wav + z = torch.randn(1,192 ,2740)# 2s data(同时用2s数据验证,整数倍就对了,防止干扰) + x_mask = torch.randn(1,1 ,2740) + g = torch.randn(1,256 ,1) + + inputs_bfcc = z #z * x_mask + nsff0 = torch.randn(1, 2740) + devices = 'cuda' #'cpu' + self.dec = self.dec.to(devices).half() + inputs_bfcc , nsff0, g = inputs_bfcc.to(devices).half(), nsff0.to(devices).half(), g.to(devices).half() + flops, params = profile(self.dec, (inputs_bfcc, nsff0, g)) + print(f'@@@hifi-gan nsf decflops: {flops/(T*pow(10,9))} GFLOPS, params: { params/pow(10,6)} M') + return 0 + +class SynthesizerTrnMs768NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, 
+ n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + #for p in self.flow.parameters(): + # p.requires_grad=False + #for p in self.enc_p.parameters(): + # p.requires_grad=False + + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + self.diff_decoder = diff_decoder + #self.diff_cond_g = nn.Conv1d(256,192, 1) + self.diff_cond_gx = self.zero_module(self.conv_nd(1, 256, 192, 3, padding=1)) + self.diff_cond_out = self.zero_module(self.conv_nd(1, 192, 192, 3, padding=1)) + self.lzp = 0.1 + self.ssl_proj = self.zero_module(nn.Conv1d(256*2, 256, 1, stride=1)) + self.ssl_proj1 = self.zero_module(nn.Conv1d(256, 256, 1, stride=1)) + self.ssl_proj1_norm = nn.BatchNorm1d(256)#, track_running_stats=False)#nn.LayerNorm(256) + self.ssl_proj2 = self.zero_module(nn.Conv1d(256, 256, 1, stride=1)) + self.ssl_proj2_norm = nn.BatchNorm1d(256)#,track_running_stats=False)#nn.LayerNorm(256) + + def zero_module(self,module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + def conv_nd(self, dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + + g = ds.unsqueeze(-1) + #g = self.ssl_proj(g)#[:,256:,:]) + g1 = self.ssl_proj1_norm( self.ssl_proj1(g[:,:256,:])) + g2 = self.ssl_proj2_norm( self.ssl_proj2(g[:,256:,:])) + g = g1 + g2 + + m_p, logs_p, x_mask, x_embed = self.enc_p(phone, pitch, phone_lengths,g)#fang add + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)#self.enc_q = PosteriorEncoder ##这里面预测出了随机采样的隐变量z,m_q是均值,logs_q是方差,y_mask是mask的数据 fangi + + z_p = self.flow(z, y_mask, g=g)# z是y_msk的输入 + z_p_sample = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * y_mask + zx = self.flow(z_p_sample, y_mask, g=g, reverse=True) + #print("@@@@@g:",g.shape) + g_z_p = self.diff_cond_gx(g) + #print("@@@@@g_z_p:",g_z_p.shape) + z_res = z - zx + + + z_p1 = x_embed + g_z_p + ###diff st + z_p_diff = z_p1.transpose(1,2) ##b,frames,feat + z_diff = z_res.transpose(1,2) ##b,frames,feat + + diff_loss,_ = self.diff_decoder(z_p_diff, gt_spec=z_diff, infer=False, infer_speedup=ddpm_dp.infer_speedup, method=ddpm_dp.method, use_tqdm=ddpm_dp.use_tqdm) + + + t = 200#np.random.randint(100,1000)#200#torch.randint(0, 1000, (b,), device=g.device).long() + z_diff = zx.transpose(1,2) + z_x_diff = self.diff_decoder(z_p_diff, gt_spec=z_diff*self.lzp, infer=True, infer_speedup=ddpm_dp.infer_speedup, method=ddpm_dp.method, k_step=t, use_tqdm=False) + #print("@@@z_x: ",z_x.shape) + z1 = 
z_x_diff.transpose(1,2) + z1 = self.diff_cond_out(z1) + z_in = (zx + z1) + + + z_slice, ids_slice = commons.rand_slice_segments( + z_in, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q),diff_loss + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): + #g = self.emb_g(sid).unsqueeze(-1) + g = sid.unsqueeze(-1).unsqueeze(0) + g1 = self.ssl_proj1_norm( self.ssl_proj1(g[:,:256,:])) + g2 = self.ssl_proj2_norm( self.ssl_proj2(g[:,256:,:])) + g = g1 + g2 + + m_p, logs_p, x_mask, x_embed = self.enc_p(phone, pitch, phone_lengths,g) #fang add + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + nsff0 = nsff0[:, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + + g_z_p = self.diff_cond_gx(g) + z_p1 = x_embed + g_z_p + + z_p_diff = z_p1.transpose(1,2).float() ##b,frames,feat + z_diff = z.transpose(1,2) ##b,frames,feat + self.diff_decoder = self.diff_decoder.float() + z_x = self.diff_decoder(z_p_diff, gt_spec=z_diff*self.lzp, infer=True, infer_speedup=ddpm_dp.infer_speedup, method=ddpm_dp.method, k_step=200, use_tqdm=ddpm_dp.use_tqdm) + z1 = z_x.transpose(1,2).half() + z_res = self.diff_cond_out(z1) + z = z + z_res + o = self.dec(z * x_mask, nsff0, g=g) + #self.get_floats() + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, 
"self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + #g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + g = ds.unsqueeze(-1) + #m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) #org + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths,g=g)#fang add + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, 
ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, rate=None): + #g = self.emb_g(sid).unsqueeze(-1) + g = sid.unsqueeze(-1).unsqueeze(0) + #m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths,g=g)#fang add + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm 
= use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/AIMeiSheng/meisheng_env_preparex.py b/AIMeiSheng/meisheng_env_preparex.py index f0c9854..62f0afb 100644 --- a/AIMeiSheng/meisheng_env_preparex.py +++ b/AIMeiSheng/meisheng_env_preparex.py @@ -1,38 +1,55 @@ import os -from AIMeiSheng.docker_demo.common import (gs_svc_model_path, gs_hubert_model_path, gs_embed_model_path, - gs_rmvpe_model_path, download2disk) +from AIMeiSheng.docker_demo.common import (gs_svc_model_path, gs_hubert_model_path, gs_embed_model_path,gs_embed_model_spk_path, gs_embed_config_spk_path, gs_rmvpe_model_path, download2disk) def meisheng_env_prepare(logging, AIMeiSheng_Path='./'): cos_path = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/" rmvpe_model_url = cos_path + "rmvpe.pt" if not os.path.exists(gs_rmvpe_model_path): if not download2disk(rmvpe_model_url, gs_rmvpe_model_path): logging.fatal(f"download rmvpe_model err={rmvpe_model_url}") gs_hubert_model_url = cos_path + "hubert_base.pt" if not os.path.exists(gs_hubert_model_path): if not download2disk(gs_hubert_model_url, gs_hubert_model_path): logging.fatal(f"download hubert_model err={gs_hubert_model_url}") #model_svc = "xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth" - model_svc = "xusong_v2_org_version_alldata_embed1_enzx_diff_ocean_ctl_enc_e22_s363704.pth" + #model_svc = "xusong_v2_org_version_alldata_embed1_enzx_diff_ocean_ctl_enc_e22_s363704.pth" + model_svc = "xusong_v2_org_version_alldata_embed_spkenx200x_vocal_e22_s95040.pth" base_dir = os.path.dirname(gs_svc_model_path) os.makedirs(base_dir, exist_ok=True) svc_model_url = cos_path + model_svc if not os.path.exists(gs_svc_model_path): if not download2disk(svc_model_url, gs_svc_model_path): logging.fatal(f"download svc_model err={svc_model_url}") model_embed = "model.pt" base_dir = os.path.dirname(gs_embed_model_path) os.makedirs(base_dir, exist_ok=True) embed_model_url = cos_path + model_embed if not os.path.exists(gs_embed_model_path): if not download2disk(embed_model_url, gs_embed_model_path): logging.fatal(f"download embed_model err={embed_model_url}") + model_spk_embed = "best_model.pth.tar" + base_dir = os.path.dirname(gs_embed_model_spk_path) + os.makedirs(base_dir, exist_ok=True) + embed_model_url = cos_path + model_spk_embed + if not 
os.path.exists(gs_embed_model_spk_path): + if not download2disk(embed_model_url, gs_embed_model_spk_path): + logging.fatal(f"download embed_model err={embed_model_url}") + + + model_spk_embed_cfg = "config.json" + base_dir = os.path.dirname(gs_embed_config_spk_path) + os.makedirs(base_dir, exist_ok=True) + embed_model_url = cos_path + model_spk_embed_cfg + if not os.path.exists(gs_embed_config_spk_path): + if not download2disk(embed_model_url, gs_embed_config_spk_path): + logging.fatal(f"download embed_model err={embed_model_url}") + if __name__ == "__main__": meisheng_env_prepare() diff --git a/AIMeiSheng/meisheng_svc_final.py b/AIMeiSheng/meisheng_svc_final.py index 6cddfdc..2080cba 100644 --- a/AIMeiSheng/meisheng_svc_final.py +++ b/AIMeiSheng/meisheng_svc_final.py @@ -1,242 +1,247 @@ import os import sys sys.path.append(os.path.dirname(__file__)) import time import shutil import glob import hashlib import librosa import soundfile import gradio as gr import pandas as pd import numpy as np from AIMeiSheng.RawNet3.infererence_fang_meisheng import get_embed, get_embed_model -from myinfer_multi_spk_embed_in_dec_diff_fi_meisheng import svc_main, load_hubert, get_vc, get_rmvpe +#from myinfer_multi_spk_embed_in_dec_diff_fi_meisheng import svc_main, load_hubert, get_vc, get_rmvpe +from AIMeiSheng.myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x import svc_main,load_hubert, get_vc,get_rmvpe + from gender_classify import load_gender_model from AIMeiSheng.docker_demo.common import gs_svc_model_path, gs_embed_model_path, gs_rmvpe_model_path, gs_err_code_target_silence from slicex.slice_set_silence import del_noise gs_simple_mixer_path = "/data/gpu_env_common/bin/simple_mixer" ##混音执行文件 tmp_workspace_name = "batch_test_ocean_fi" # 工作空间名 song_folder = "./data_meisheng/" ##song folder gs_work_dir = f"./data_meisheng/{tmp_workspace_name}" # 工作空间路径 pth_model_path = "./weights/xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth" ##模型文件 cur_dir = os.path.abspath(os.path.dirname(__file__)) abs_path = os.path.join(cur_dir, song_folder, tmp_workspace_name) + '/' f0_method = None def mix(in_path, acc_path, dst_path): # svc转码到442 svc_442_file = in_path + "_442.wav" st = time.time() cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(in_path, svc_442_file) os.system(cmd) if not os.path.exists(svc_442_file): return -1 print("transcode,{},sp={}".format(in_path, time.time() - st)) # 混合 st = time.time() cmd = "{} {} {} {} 1".format(gs_simple_mixer_path, svc_442_file, acc_path, dst_path) os.system(cmd) print("mixer,{},sp={}".format(in_path, time.time() - st)) def load_model(): global f0_method embed_model = get_embed_model(gs_embed_model_path) hubert_model = load_hubert() get_vc(gs_svc_model_path) f0_method = get_rmvpe(gs_rmvpe_model_path) print("model preload finish!!!") return embed_model, hubert_model # ,svc_model def meisheng_init(): embed_model, hubert_model = load_model() ##提前加载模型 gender_model = load_gender_model() return embed_model, hubert_model, gender_model def pyin_process_single_rmvpe(input_file): global f0_method if f0_method is None: f0_method = get_rmvpe() rate = 16000 # 44100 # 读取音频文件 y, sr = librosa.load(input_file, sr=rate) len_s = len(y) / sr lim_s = 15 # 10 f0_limit_10ms = 10 if (len_s > lim_s): y1 = y[:sr * lim_s] y2 = y[-sr * lim_s:] f0 = f0_method.infer_from_audio(y1, thred=0.03) f0 = f0[f0 < 600] valid_f0 = f0[f0 > 50] if len(valid_f0) > f0_limit_10ms: mean_pitch1 = np.mean(valid_f0) else: mean_pitch1 = 0 f0 = f0_method.infer_from_audio(y2, thred=0.03) 
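+        # Same post-filtering as the head segment above: keep only plausible vocal F0
+        # (50-600 Hz) before averaging, so octave errors and unvoiced frames do not skew
+        # the tail segment's mean pitch; a segment with fewer than f0_limit_10ms voiced
+        # frames is treated as silent (mean pitch 0).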
        f0 = f0[f0 < 600]
        valid_f0 = f0[f0 > 50]
        if len(valid_f0) > f0_limit_10ms:
            mean_pitch2 = np.mean(valid_f0)
        else:
            mean_pitch2 = 0

        if mean_pitch2 == 0 and mean_pitch1 == 0:
            mean_pitch_cur = 0
        elif mean_pitch2 == 0 or mean_pitch1 == 0:
            mean_pitch_cur = max(mean_pitch1, mean_pitch2)
        elif abs(mean_pitch1 - mean_pitch2) > 55:
            mean_pitch_cur = min(mean_pitch1, mean_pitch2)
        else:
            mean_pitch_cur = (mean_pitch1 + mean_pitch2) / 2
    else:
        f0 = f0_method.infer_from_audio(y, thred=0.03)
        f0 = f0[f0 < 600]
        valid_f0 = f0[f0 > 50]
        if len(valid_f0) > f0_limit_10ms:
            mean_pitch_cur = np.mean(valid_f0)
        else:
            mean_pitch_cur = 0

    return mean_pitch_cur


-def meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, paras):
+def meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, cs_sim, paras):
    ## compute pitch
    f0up_key = pyin_process_single_rmvpe(target_wav)
    if f0up_key < 40 or np.isnan(f0up_key):  # unvoiced
        return gs_err_code_target_silence

    ## get the timbre embedding
    get_embed(target_wav, embed_npy, embed_md)
+    embed_npy_spk = embed_npy[:-4] + '_spk.npy'
+    cs_sim.get_spk_embed(target_wav, embed_npy_spk)
+    print(f"get embed_npy_spk: {embed_npy_spk}")

    print("svc main start...")
    svc_main(song_wav, svc_out_path, embed_npy, f0up_key, hubert_md, paras)
    print("svc main finished!!")

    del_noise(song_wav, svc_out_path, paras)
    print("del noise in silence")

    return 0


-def process_svc_online(song_wav, target_wav, svc_out_path, embed_md, hubert_md, paras):
+def process_svc_online(song_wav, target_wav, svc_out_path, embed_md, hubert_md, cs_sim, paras):
    embed_npy = target_wav[:-4] + '.npy'  ## where the embedding npy is stored
-    err_code = meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, paras)
+    err_code = meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, cs_sim, paras)
    return err_code


-def process_svc(song_wav, target_wav, svc_out_path, embed_md, hubert_md, paras):
+def process_svc(song_wav, target_wav, svc_out_path, embed_md, hubert_md, cs_sim, paras):
    song_wav1, target_wav, svc_out_path = os.path.basename(song_wav), os.path.basename(
        target_wav), os.path.basename(svc_out_path)  # absolute paths
    song_wav, target_wav, svc_out_path = song_wav, abs_path + target_wav, abs_path + svc_out_path
    embed_npy = target_wav[:-4] + '.npy'  ## where the embedding npy is stored
    # similar = meisheng_svc(song_wav,target_wav,svc_out_path,embed_npy,paras)
-    similar = meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, paras)
+    similar = meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, cs_sim, paras)
    return similar


def get_svc(target_yinse_wav, song_name, embed_model, hubert_model, paras):
    '''
    :param target_yinse_wav: target timbre wav
    :param song_name: song name
    :param paras: other parameters
    :return: path of the svc result
    '''
+    # cos_similar is otherwise only imported in svc_online.py in this changeset; without
+    # this local import/instantiation the process_svc(...) call below would raise a
+    # NameError on cs_sim in the offline test path.
+    from AIMeiSheng.cos_similar_ui_zoom import cos_similar
+    cs_sim = cos_similar()

    ## clear the temporary workspace
    if os.path.exists(gs_work_dir):
        # shutil.rmtree(gs_work_dir)
        cmd = f"rm -rf {gs_work_dir}/*"
        os.system(cmd)
    else:
        os.makedirs(gs_work_dir)

    gender = paras['gender']  ## used to pick the song

    ## load the target timbre
    f_dst = os.path.join(gs_work_dir, os.path.basename(target_yinse_wav))
    # print("dir :", f_dst, "target_yinse_wav:", target_yinse_wav)
    # shutil.move(target_yinse_wav, f_dst)  ## put it in the working directory
    shutil.copy(target_yinse_wav, f_dst)
    target_yinse_wav = f_dst

    ## load song vocal / accompaniment (paths may need adjusting)
    song_wav = os.path.join("{}{}/{}/vocal321.wav".format(song_folder, gender, song_name))  # song vocal
    inf_acc_path = os.path.join("{}{}/{}/acc.wav".format(song_folder, gender, song_name))
    # song_wav = './xusong_long.wav'
    svc_out_path = os.path.join(gs_work_dir, "svc.wav")  ### name of the svc output

    print("inputMsg:", song_wav, target_yinse_wav,
svc_out_path) ## svc process st = time.time() print("start inference...") - similar = process_svc(song_wav, target_yinse_wav, svc_out_path, embed_model, hubert_model, paras) + similar = process_svc(song_wav, target_yinse_wav, svc_out_path, embed_model, hubert_model, cs_sim, paras) print("svc finished!!") print("time cost = {}".format(time.time() - st)) print("out path name {} ".format(svc_out_path)) # ''' ##加混响 print("add reverbration...") svc_out_path_effect = svc_out_path[:-4] + '_effect.wav' cmd = f"/data/gpu_env_common/bin/effect_tool {svc_out_path} {svc_out_path_effect}" print("cmd :", cmd) os.system(cmd) # # 人声伴奏合并 print("add acc...") out_path = svc_out_path_effect[:-4] + '_music.wav' mix(svc_out_path_effect, inf_acc_path, out_path) print("time cost = {}".format(time.time() - st)) print("out path name {} ".format(out_path)) # ''' return svc_out_path def meisheng_func(target_yinse_wav, song_name, paras): ##init embed_model, hubert_model, gender_model = meisheng_init() ###gender predict gender, female_rate, is_pure = gender_model.process(target_yinse_wav) print('=====================') print("gender:{}, female_rate:{},is_pure:{}".format(gender, female_rate, is_pure)) if gender == 0: gender = 'female' elif gender == 1: gender = 'male' elif female_rate > 0.5: gender = 'female' else: gender = 'male' print("modified gender:{} ".format(gender)) print('=====================') ##美声main paras['gender'] = gender ##单位都是ms get_svc(target_yinse_wav, song_name, embed_model, hubert_model, paras) if __name__ == '__main__': # target_yinse_wav = "./raw/meisheng_yinse/female/changying.wav" # 需要完整路径 target_yinse_wav = "./raw/meisheng_yinse/female/target_yinse_cloris.m4a" song_name = "lost_stars" ##歌曲名字 paras = {'gender': None, 'tst': 0, "tnd": None, 'delay': 0, 'song_path': None} # paras = {'gender': 'female', 'tst': 0, "tnd": 30, 'delay': 0} ###片段svc测试 meisheng_func(target_yinse_wav, song_name, paras) diff --git a/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x.py b/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x.py new file mode 100644 index 0000000..f4944c7 --- /dev/null +++ b/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_meisheng_ctl_enc_spk200x.py @@ -0,0 +1,217 @@ + +import os,sys,pdb,torch +now_dir = os.getcwd() +sys.path.append(now_dir) +import argparse +import glob +import sys +import torch +from multiprocessing import cpu_count +class Config: + def __init__(self,device,is_half): + self.device = device + self.is_half = is_half + self.n_cpu = 0 + self.gpu_name = None + self.gpu_mem = None + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() + + def device_config(self) -> tuple: + current_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(current_dir, "configs") + if torch.cuda.is_available(): + i_device = int(self.device.split(":")[-1]) + self.gpu_name = torch.cuda.get_device_name(i_device) + if ( + ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) + or "P40" in self.gpu_name.upper() + or "1060" in self.gpu_name + or "1070" in self.gpu_name + or "1080" in self.gpu_name + ): + print("16系/10系显卡和P40强制单精度") + self.is_half = False + for config_file in ["32k.json", "40k.json", "48k.json"]: + with open(f"{config_path}/{config_file}", "r") as f: + strr = f.read().replace("true", "false") + with open(f"{config_path}/{config_file}", "w") as f: + f.write(strr) + with open(f"{current_dir}/trainset_preprocess_pipeline_print.py", "r") as f: + strr = f.read().replace("3.7", "3.0") + with 
open(f"{current_dir}/trainset_preprocess_pipeline_print.py", "w") as f: + f.write(strr) + else: + self.gpu_name = None + self.gpu_mem = int( + torch.cuda.get_device_properties(i_device).total_memory + / 1024 + / 1024 + / 1024 + + 0.4 + ) + if self.gpu_mem <= 4: + with open(f"{current_dir}/trainset_preprocess_pipeline_print.py", "r") as f: + strr = f.read().replace("3.7", "3.0") + with open(f"{current_dir}/trainset_preprocess_pipeline_print.py", "w") as f: + f.write(strr) + elif torch.backends.mps.is_available(): + print("没有发现支持的N卡, 使用MPS进行推理") + self.device = "mps" + else: + print("没有发现支持的N卡, 使用CPU进行推理") + self.device = "cpu" + self.is_half = True + + if self.n_cpu == 0: + self.n_cpu = cpu_count() + + if self.is_half: + # 6G显存配置 + x_pad = 3 + x_query = 10 + x_center = 80 #60 + x_max = 85#65 + else: + # 5G显存配置 + x_pad = 1 + x_query = 6 + x_center = 38 + x_max = 41 + + if self.gpu_mem != None and self.gpu_mem <= 4: + x_pad = 1 + x_query = 5 + x_center = 30 + x_max = 32 + + return x_pad, x_query, x_center, x_max + + +index_path="./logs/xusong_v2_org_version_multispk_charlie_puth_embed_in_dec_muloss_show/added_IVF614_Flat_nprobe_1_xusong_v2_org_version_multispk_charlie_puth_embed_in_dec_show_v2.index" +# f0method="rmvpe" #harvest or pm +index_rate=float("0.0") #index rate +device="cuda:0" +is_half=True +filter_radius=int(3) ##3 +resample_sr=int(0) # 0 +rms_mix_rate=float(1) # rms混合比例 1,不等于1混合 +protect=float(0.33 )## ??? 0.33 fang + + + +#print(sys.argv) +config=Config(device,is_half) +now_dir=os.getcwd() +sys.path.append(now_dir) + +from vc_infer_pipeline_org_embed_spk import VC +from lib.infer_pack.models_embed_in_dec_diff_control_enc_spken200x import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from lib.audio import load_audio +from fairseq import checkpoint_utils +from scipy.io import wavfile +from AIMeiSheng.docker_demo.common import gs_hubert_model_path +# hubert_model=None +def load_hubert(): + # global hubert_model + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([gs_hubert_model_path],suffix="",) + #models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["checkpoint_best_legacy_500.pt"],suffix="",) + hubert_model = models[0] + hubert_model = hubert_model.to(device) + if(is_half):hubert_model = hubert_model.half() + else:hubert_model = hubert_model.float() + hubert_model.eval() + return hubert_model + +def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,index_rate,hubert_model,paras): + global tgt_sr,net_g,vc,version + if input_audio is None:return "You need to upload an audio", None + f0_up_key = int(f0_up_key) + # print("@@xxxf0_up_key:",f0_up_key) + audio = load_audio(input_audio,16000) + if paras != None: + st = int(paras['tst'] * 16000/1000) + en = len(audio) + if paras['tnd'] != None: + en = min(en,int(paras['tnd'] * 16000/1000)) + audio = audio[st:en] + + times = [0, 0, 0] + if(hubert_model==None): + hubert_model = load_hubert() + if_f0 = cpt.get("f0", 1) + audio_opt=vc.pipeline_mulprocess(hubert_model,net_g,sid,audio,input_audio,times,f0_up_key,f0_method,file_index,index_rate,if_f0,filter_radius,tgt_sr,resample_sr,rms_mix_rate,version,protect,f0_file=f0_file) + + #print(times) + #print("@@using multi process") + return audio_opt + + +def get_vc_core(model_path,is_half): + + #print("loading pth %s" % model_path) + cpt = torch.load(model_path, map_location="cpu") + tgt_sr = cpt["config"][-1] + cpt["config"][-3] = 
cpt["weight"]["emb_g.weight"].shape[0] + if_f0 = cpt.get("f0", 1) + version = cpt.get("version", "v1") + if version == "v1": + if if_f0 == 1: + net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif version == "v2": + if if_f0 == 1: # + net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) + else: + net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + #print("load model finished") + del net_g.enc_q + net_g.load_state_dict(cpt["weight"], strict=False) + #print("load net_g finished") + + return tgt_sr,net_g,cpt,version + +def get_vc1(model_path,is_half): + tgt_sr, net_g, cpt, version = get_vc_core(model_path, is_half) + + net_g.eval().to(device) + if (is_half):net_g = net_g.half() + else:net_g = net_g.float() + vc = VC(tgt_sr, config) + n_spk=cpt["config"][-3] + return +def get_rmvpe(model_path="rmvpe.pt"): + from lib.rmvpe import RMVPE + global f0_method + #print("loading rmvpe model") + f0_method = RMVPE(model_path, is_half=True, device='cuda') + return f0_method + + +def get_vc(model_path): + global n_spk,tgt_sr,net_g,vc,cpt,device,is_half,version + tgt_sr, net_g, cpt, version = get_vc_core(model_path, is_half) + + net_g.eval().to(device) + if (is_half):net_g = net_g.half() + else:net_g = net_g.float() + vc = VC(tgt_sr, config) + n_spk=cpt["config"][-3] + # return {"visible": True,"maximum": n_spk, "__type__": "update"} + # return net_g + + +def svc_main(input_path,opt_path,sid_embed,f0up_key=0,hubert_model=None, paras=None): + #print("sid_embed: ",sid_embed) + wav_opt = vc_single(sid_embed,input_path,f0up_key,None,f0_method,index_path,index_rate,hubert_model,paras) + #print("out_path: ",opt_path) + wavfile.write(opt_path, tgt_sr, wav_opt) + + + + diff --git a/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py b/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py new file mode 100644 index 0000000..076184f --- /dev/null +++ b/AIMeiSheng/vc_infer_pipeline_org_embed_spk.py @@ -0,0 +1,778 @@ +import numpy as np, parselmouth, torch, pdb, sys, os +from time import time as ttime +import torch.nn.functional as F +import scipy.signal as signal +import pyworld, os, traceback, faiss, librosa, torchcrepe +from scipy import signal +from functools import lru_cache + +now_dir = os.getcwd() +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} +fidx = 0 + +import threading +import concurrent.futures + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class VC(object): + def 
+class VC(object):
+    def __init__(self, tgt_sr, config):
+        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+            config.x_pad,  # varies with the device profile, e.g. 3
+            config.x_query,  # 10; equals (x_max - x_center) * 2
+            config.x_center,  # 60
+            config.x_max,  # 65
+            config.is_half,
+        )
+        self.sr = 16000  # sample rate expected by the hubert input
+        self.window = 160  # samples per frame (hop size)
+        self.t_pad = self.sr * self.x_pad  # padding added before and after each chunk
+        self.t_pad_tgt = tgt_sr * self.x_pad
+        self.t_pad2 = self.t_pad * 2
+        self.t_query = self.sr * self.x_query  # search span around each candidate cut point
+        self.t_center = self.sr * self.x_center  # spacing between candidate cut points
+        self.t_max = self.sr * self.x_max  # below this length, no cut-point search is needed
+        self.device = config.device
+
+    def get_f0(
+        self,
+        input_audio_path,
+        x,
+        p_len,
+        f0_up_key,
+        f0_method,
+        filter_radius,
+        inp_f0=None,
+    ):
+        global input_audio_path2wav
+        time_step = self.window / self.sr * 1000
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+        if f0_method == "pm":
+            f0 = (
+                parselmouth.Sound(x, self.sr)
+                .to_pitch_ac(
+                    time_step=time_step / 1000,
+                    voicing_threshold=0.6,
+                    pitch_floor=f0_min,
+                    pitch_ceiling=f0_max,
+                )
+                .selected_array["frequency"]
+            )
+            pad_size = (p_len - len(f0) + 1) // 2
+            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                f0 = np.pad(
+                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                )
+        elif f0_method == "harvest":
+            input_audio_path2wav[input_audio_path] = x.astype(np.double)
+            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+            if filter_radius > 2:
+                f0 = signal.medfilt(f0, 3)
+        elif f0_method == "crepe":
+            model = "full"
+            # Pick a batch size that doesn't cause memory errors on your gpu
+            batch_size = 512
+            # Compute pitch using first gpu
+            audio = torch.tensor(np.copy(x))[None].float()
+            f0, pd = torchcrepe.predict(
+                audio,
+                self.sr,
+                self.window,
+                f0_min,
+                f0_max,
+                model,
+                batch_size=batch_size,
+                device=self.device,
+                return_periodicity=True,
+            )
+            pd = torchcrepe.filter.median(pd, 3)
+            f0 = torchcrepe.filter.mean(f0, 3)
+            f0[pd < 0.1] = 0
+            f0 = f0[0].cpu().numpy()
+        elif f0_method == "rmvpe":
+            if not hasattr(self, "model_rmvpe"):
+                from lib.rmvpe import RMVPE
+
+                print("loading rmvpe model")
+                self.model_rmvpe = RMVPE(
+                    "rmvpe.pt", is_half=self.is_half, device=self.device
+                )
+            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+        else:  # for meisheng: f0_method is already a loaded RMVPE instance
+            self.model_rmvpe = f0_method
+            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+
+        # Estimate the current mean pitch, then derive the shift (fang)
+        valid_f0 = f0[f0 > 50]
+        mean_pitch_cur = np.mean(valid_f0[:min(len(valid_f0), 500)])
+
+        deta = 0
+        if f0_up_key > 50:  # values above 50 are interpreted as a target mean pitch in Hz
+            deta = -mean_pitch_cur + f0_up_key
+
+        # method 2 (fang): convert the Hz offset into semitones,
+        # then snap the shift to 0 or a whole octave
+        f0_up_key = int(np.log2(deta / (mean_pitch_cur + 1) + 1) * 12)
+        if abs(f0_up_key) <= 8:
+            f0_up_key = 0
+        elif f0_up_key > 8:
+            f0_up_key = 12
+        elif f0_up_key < -8:
+            f0_up_key = -12
+        f0_up_key = max(min(12, f0_up_key), -12)
+
+        f0 *= pow(2, f0_up_key / 12)  # apply the pitch shift here (fang); in practice I set it to 0
+        tf0 = self.sr // self.window  # f0 points per second
+        if inp_f0 is not None:
+            delta_t = np.round(
+                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+            ).astype("int16")
+            replace_f0 = np.interp(
+                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+            )
+            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
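+
+        # The remainder maps f0 (Hz) onto a coarse 1..255 mel scale:
+        #   mel = 1127 * ln(1 + f / 700), rescaled linearly so that
+        #   f0_min -> 1 and f0_max -> 255; 0 Hz (unvoiced) stays at 1.
+        # Rough worked example (not from the source): f = 440 Hz gives
+        # mel ~= 550, which lands near bin 122 for f0_min=50, f0_max=1100.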
+        f0bak = f0.copy()
+        f0_mel = 1127 * np.log(1 + f0 / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        f0_coarse = np.rint(f0_mel).astype(int)
+        return f0_coarse, f0bak
+
+    def vc(
+        self,
+        model,
+        net_g,
+        sid,
+        audio0,
+        pitch,
+        pitchf,
+        times,
+        index,
+        big_npy,
+        index_rate,
+        version,
+        protect,
+    ):
+        feats = torch.from_numpy(audio0)
+        if self.is_half:
+            feats = feats.half()
+        else:
+            feats = feats.float()
+        if feats.dim() == 2:  # stereo: average the two channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+        inputs = {
+            "source": feats.to(self.device),
+            "padding_mask": padding_mask,
+            "output_layer": 9 if version == "v1" else 12,
+        }
+        t0 = ttime()
+        with torch.no_grad():
+            logits = model.extract_features(**inputs)
+            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]  # why does v1 need the projection -- a dimension issue? (fang)
+
+        '''
+        # debug: round-trip the features through disk
+        global fidx
+        feats_name = f"./feats_{fidx}.pt"
+        fidx += 1
+        torch.save(feats, feats_name)
+        feats = torch.load(feats_name)
+        #'''
+
+        if protect < 0.5 and pitch is not None and pitchf is not None:
+            feats0 = feats.clone()
+        if (
+            index is not None
+            and big_npy is not None
+            and index_rate != 0
+        ):
+            npy = feats[0].cpu().numpy()
+            if self.is_half:
+                npy = npy.astype("float32")
+
+            score, ix = index.search(npy, k=8)
+            weight = np.square(1 / score)
+            weight /= weight.sum(axis=1, keepdims=True)
+            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+
+            if self.is_half:
+                npy = npy.astype("float16")
+            feats = (
+                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                + (1 - index_rate) * feats
+            )  # blend retrieved index features with the live audio features as input (fang)
+
+        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        if protect < 0.5 and pitch is not None and pitchf is not None:
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )  # double the time dimension of feats0 as well (fang)
+        t1 = ttime()
+        p_len = audio0.shape[0] // self.window  # number of pitch frames (fang)
+        if feats.shape[1] < p_len:
+            p_len = feats.shape[1]
+        if pitch is not None and pitchf is not None:
+            pitch = pitch[:, :p_len]
+            pitchf = pitchf[:, :p_len]
+
+        if protect < 0.5 and pitch is not None and pitchf is not None:
+            pitchff = pitchf.clone()
+            pitchff[pitchf > 0] = 1
+            pitchff[pitchf < 1] = protect
+            pitchff = pitchff.unsqueeze(-1)
+            feats = feats * pitchff + feats0 * (1 - pitchff)
+            feats = feats.to(feats0.dtype)
+        p_len = torch.tensor([p_len], device=self.device).long()
+        with torch.no_grad():
+            if pitch is not None and pitchf is not None:
+                audio1 = (
+                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                    .data.cpu()
+                    .float()
+                    .numpy()
+                )
+            else:
+                audio1 = (
+                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
+                )
+        del feats, p_len, padding_mask
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        t2 = ttime()
+        times[0] += t1 - t0
+        times[2] += t2 - t1
+        return audio1
+
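+    # The index lookup inside vc() retrieves the k=8 nearest training frames
+    # and blends them by inverse-squared-distance weights. A minimal
+    # self-contained sketch of the same pattern (toy dimensions, not from
+    # this file):
+    #
+    #   import faiss, numpy as np
+    #   big = np.random.rand(1000, 256).astype("float32")
+    #   index = faiss.IndexFlatL2(256); index.add(big)
+    #   q = np.random.rand(10, 256).astype("float32")
+    #   score, ix = index.search(q, k=8)
+    #   w = np.square(1 / score); w /= w.sum(axis=1, keepdims=True)
+    #   mixed = np.sum(big[ix] * np.expand_dims(w, axis=2), axis=1)
+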
+    def pipeline(
+        self,
+        model,
+        net_g,
+        sid,
+        audio,  # input wav
+        input_audio_path,  # input wav name
+        times,
+        f0_up_key,
+        f0_method,  # f0 method
+        file_index,  # path to the .index file
+        index_rate,
+        if_f0,
+        filter_radius,
+        tgt_sr,
+        resample_sr,
+        rms_mix_rate,
+        version,
+        protect,
+        f0_file=None,
+    ):
+        if (
+            file_index != ""  # the .index path is non-empty (fang)
+            and os.path.exists(file_index)
+            and index_rate != 0
+        ):
+            try:
+                index = faiss.read_index(file_index)
+                big_npy = index.reconstruct_n(0, index.ntotal)
+            except Exception:
+                traceback.print_exc()
+                index = big_npy = None
+        else:
+            index = big_npy = None
+        audio = signal.filtfilt(bh, ah, audio)
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+        opt_ts = []
+
+        if audio_pad.shape[0] > self.t_max:
+            audio_sum = np.zeros_like(audio)
+            for i in range(self.window):
+                audio_sum += audio_pad[i : i - self.window]  # rolling sum: each index holds the sum of one window of samples (fang)
+            for t in range(self.t_center, audio.shape[0], self.t_center):  # one candidate cut per x_center seconds (fang)
+                opt_ts.append(
+                    t
+                    - self.t_query
+                    + np.where(
+                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                    )[0][0]
+                )  # keep the position of the minimum |audio_sum| within [t - t_query, t + t_query] (fang)
+        s = 0
+        audio_opt = []
+        t = None
+        t1 = ttime()
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        p_len = audio_pad.shape[0] // self.window
+        inp_f0 = None
+        if hasattr(f0_file, "name"):
+            try:
+                with open(f0_file.name, "r") as f:
+                    lines = f.read().strip("\n").split("\n")
+                inp_f0 = []
+                for line in lines:
+                    inp_f0.append([float(i) for i in line.split(",")])
+                inp_f0 = np.array(inp_f0, dtype="float32")
+            except Exception:
+                traceback.print_exc()
+
+        sid_embed = np.load(sid)  # sid is a path to a saved speaker-embedding .npy
+        sid = torch.FloatTensor(sid_embed).to(self.device).half()
+        pitch, pitchf = None, None
+        if if_f0 == 1:
+            pitch, pitchf = self.get_f0(
+                input_audio_path,
+                audio_pad,
+                p_len,
+                f0_up_key,
+                f0_method,
+                filter_radius,
+                inp_f0,
+            )
+            pitch = pitch[:p_len]
+            pitchf = pitchf[:p_len]
+            if self.device == "mps":
+                pitchf = pitchf.astype(np.float32)
+            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+
+        t2 = ttime()
+        times[1] += t2 - t1
+        for t in opt_ts:  # convert each segment of roughly 60 s in turn (fang)
+            t = t // self.window * self.window
+            if if_f0 == 1:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            else:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        None,
+                        None,
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            s = t
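+        # Each chunk above covers [s, t + t_pad2 + window] samples, with cut
+        # points chosen where the rolling energy is lowest so seams fall in
+        # quiet spots; the tail after the last cut is converted below in one
+        # final call.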
+        if if_f0 == 1:  # what follows converts the final segment (fang)
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    pitch[:, t // self.window :] if t is not None else pitch,
+                    pitchf[:, t // self.window :] if t is not None else pitchf,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
+        else:
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    None,
+                    None,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
+        audio_opt = np.concatenate(audio_opt)
+        if rms_mix_rate != 1:
+            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+        if resample_sr >= 16000 and tgt_sr != resample_sr:
+            audio_opt = librosa.resample(
+                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+            )
+        audio_max = np.abs(audio_opt).max() / 0.99
+        max_int16 = 32768
+        if audio_max > 1:
+            max_int16 /= audio_max
+        audio_opt = (audio_opt * max_int16).astype(np.int16)
+        del pitch, pitchf, sid
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return audio_opt
+
+    def infer_core_fang(self, para1, para2, para3, idx,
+                        model,
+                        net_g,
+                        sid,
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect):
+        # para1/para2/para3 are one segment's audio_pad slice, pitch slice and
+        # pitchf slice; idx tags the result so segments can be re-ordered.
+        return [self.vc(
+            model,
+            net_g,
+            sid,
+            para1, para2, para3,
+            times,
+            index,
+            big_npy,
+            index_rate,
+            version,
+            protect,
+        )[self.t_pad_tgt : -self.t_pad_tgt], idx]
+
+    def ThreadPool_process_core(self, func_process, params1, params2, params3,
+                                model,
+                                net_g,
+                                sid,
+                                times,
+                                index,
+                                big_npy,
+                                index_rate,
+                                version,
+                                protect):
+        num_threads = 2
+        futures = []
+        sort_ret = {}
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+            for idx in range(len(params1)):
+                para1 = params1[idx]
+                para2 = params2[idx]
+                para3 = params3[idx]
+                ret = executor.submit(self.infer_core_fang, para1, para2, para3, idx,
+                                      model,
+                                      net_g,
+                                      sid,
+                                      times,
+                                      index,
+                                      big_npy,
+                                      index_rate,
+                                      version,
+                                      protect)
+                futures.append(ret)
+
+            cnt = 0
+            for future in concurrent.futures.as_completed(futures):
+                cnt += 1
+                # print(f"process finished {cnt}, index: {future.result()[1]}")
+                sort_ret[str(future.result()[1])] = future.result()[0]
+
+        fea_list = []
+        for idx in range(len(sort_ret)):
+            fea_list.append(sort_ret[str(idx)])
+
+        return fea_list
+
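+    # ThreadPool_process_core preserves segment order by tagging each future's
+    # result with its submit index and re-sorting on completion. A minimal
+    # standalone sketch of the same pattern (toy worker, not from this file):
+    #
+    #   vals = [1, 2, 3]
+    #   with concurrent.futures.ThreadPoolExecutor(max_workers=2) as ex:
+    #       futs = [ex.submit(lambda x, i=i: (x * 2, i), v) for i, v in enumerate(vals)]
+    #       ret = {}
+    #       for f in concurrent.futures.as_completed(futs):
+    #           out, i = f.result()
+    #           ret[i] = out
+    #   ordered = [ret[i] for i in range(len(ret))]
+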
+    def pipeline_mulprocess(
+        self,
+        model,
+        net_g,
+        sid,
+        audio,  # input wav
+        input_audio_path,  # input wav name
+        times,
+        f0_up_key,
+        f0_method,  # f0 method
+        file_index,  # path to the .index file
+        index_rate,
+        if_f0,
+        filter_radius,
+        tgt_sr,
+        resample_sr,
+        rms_mix_rate,
+        version,
+        protect,
+        f0_file=None,
+    ):
+        if (
+            file_index != ""  # the .index path is non-empty (fang)
+            and os.path.exists(file_index)
+            and index_rate != 0
+        ):
+            try:
+                index = faiss.read_index(file_index)
+                big_npy = index.reconstruct_n(0, index.ntotal)
+            except Exception:
+                traceback.print_exc()
+                index = big_npy = None
+        else:
+            index = big_npy = None
+        audio = signal.filtfilt(bh, ah, audio)
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+        opt_ts = []
+        if audio_pad.shape[0] > self.t_max:
+            audio_sum = np.zeros_like(audio)
+            for i in range(self.window):
+                audio_sum += audio_pad[i : i - self.window]  # rolling sum, as in pipeline() (fang)
+            for t in range(self.t_center, audio.shape[0], self.t_center):
+                opt_ts.append(
+                    t
+                    - self.t_query
+                    + np.where(
+                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                    )[0][0]
+                )  # keep the position of the minimum |audio_sum| within [t - t_query, t + t_query] (fang)
+        s = 0
+
+        t = None
+        t1 = ttime()
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        p_len = audio_pad.shape[0] // self.window
+        inp_f0 = None
+        if hasattr(f0_file, "name"):
+            try:
+                with open(f0_file.name, "r") as f:
+                    lines = f.read().strip("\n").split("\n")
+                inp_f0 = []
+                for line in lines:
+                    inp_f0.append([float(i) for i in line.split(",")])
+                inp_f0 = np.array(inp_f0, dtype="float32")
+            except Exception:
+                traceback.print_exc()
+
+        # Load the content embedding and its companion speaker embedding
+        # ("xxx.npy" -> "xxx_spk.npy") and concatenate them into one vector.
+        sid_embed = np.load(sid)
+        embed_npy_spk = sid[:-4] + '_spk.npy'
+        sid_spk_embed = np.load(embed_npy_spk)
+        print("sid_embed:", np.shape(sid_embed), 'type:', type(sid_embed))
+        print('sid_spk_embed:', np.shape(sid_spk_embed), 'type:', type(sid_spk_embed))
+        sid_embed = np.concatenate((sid_embed, sid_spk_embed), axis=0)
+        print('sid_embed:', np.shape(sid_embed), 'type:', type(sid_embed))
+        sid = torch.FloatTensor(sid_embed).to(self.device).half()
+        print('sid:', sid.shape)
+
+        pitch, pitchf = None, None
+        if if_f0 == 1:
+            pitch, pitchf = self.get_f0(
+                input_audio_path,
+                audio_pad,
+                p_len,
+                f0_up_key,
+                f0_method,
+                filter_radius,
+                inp_f0,
+            )
+            pitch = pitch[:p_len]
+            pitchf = pitchf[:p_len]
+            if self.device == "mps":
+                pitchf = pitchf.astype(np.float32)
+            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+
+        '''
+        # debug: reload pitch from disk instead of recomputing
+        pitch_name = "./pitch_pitchf.npz"
+        # np.savez(pitch_name, pitch=pitch.detach().cpu().numpy(), pitchf=pitchf.detach().cpu().numpy())
+        npz_obj = np.load(pitch_name)
+        pitch, pitchf = npz_obj['pitch'], npz_obj['pitchf']
+        pitch = torch.tensor(pitch, device=self.device).long()
+        pitchf = torch.tensor(pitchf, device=self.device).float()
+        #'''
+
+        t2 = ttime()
+        times[1] += t2 - t1
+
+        audio_opt = []
+        audio_pad_list = []
+        pitch_list = []
+        pitchf_list = []
+
+        for t in opt_ts:  # split into segments of roughly 60 s at the chosen cut points (fang)
+            t = t // self.window * self.window
+            audio_pad_list.append(audio_pad[s : t + self.t_pad2 + self.window])
+            pitch_list.append(pitch[:, s // self.window : (t + self.t_pad2) // self.window])
+            pitchf_list.append(pitchf[:, s // self.window : (t + self.t_pad2) // self.window])
+            s = t
+
+        audio_pad_list.append(audio_pad[t:])
+        pitch_list.append(pitch[:, t // self.window :] if t is not None else pitch)
+        pitchf_list.append(pitchf[:, t // self.window :] if t is not None else pitchf)
+
+        audio_opt = self.ThreadPool_process_core(self.infer_core_fang, audio_pad_list, pitch_list, pitchf_list,
+                                                 model,
+                                                 net_g,
+                                                 sid,
+                                                 times,
+                                                 index,
+                                                 big_npy,
+                                                 index_rate,
+                                                 version,
+                                                 protect)
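+        # The triple-quoted block below is the old serial final-segment path
+        # from pipeline(), kept for reference; ThreadPool_process_core above
+        # now converts every segment, including the last one.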
+        '''
+        if if_f0 == 1:  # handle the final segment (fang)
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    pitch[:, t // self.window :] if t is not None else pitch,
+                    pitchf[:, t // self.window :] if t is not None else pitchf,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
+        else:
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    None,
+                    None,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
+        #'''
+        audio_opt = np.concatenate(audio_opt)
+        if rms_mix_rate != 1:
+            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+        if resample_sr >= 16000 and tgt_sr != resample_sr:
+            audio_opt = librosa.resample(
+                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+            )
+        audio_max = np.abs(audio_opt).max() / 0.99
+        max_int16 = 32768
+        if audio_max > 1:
+            max_int16 /= audio_max
+        audio_opt = (audio_opt * max_int16).astype(np.int16)
+        del pitch, pitchf, sid
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return audio_opt
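+
+# Naming convention assumed by pipeline_mulprocess: for a speaker embedding
+# stored at "xxx.npy", a second speaker-level embedding is expected at
+# "xxx_spk.npy"; the two are loaded and concatenated before being handed to
+# the decoder. A hedged sketch of preparing such a pair (hypothetical shapes
+# and paths):
+#
+#   np.save("singer.npy", emb)       # content/voice embedding, e.g. shape (256,)
+#   np.save("singer_spk.npy", spk)   # speaker embedding, e.g. shape (200,)
+#   # then call with sid="singer.npy"; the pipeline loads and concatenates both.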