diff --git a/AutoCoverTool/online/inference_one_v1.py b/AutoCoverTool/online/inference_one_v1.py
index 92e9495..7e5a151 100644
--- a/AutoCoverTool/online/inference_one_v1.py
+++ b/AutoCoverTool/online/inference_one_v1.py
@@ -1,190 +1,190 @@
 """
 Single-item processing logic
 Environment installation:
 conda create -n auto_song_cover python=3.9
 # Install the demucs environment [cd into ref.music_remover and run pip install -r requirements.txt]
 # Install the so_vits_svc environment [cd into ref.so_vits_svc and run pip install -r requirements.txt]
 pip install librosa
 pip install scikit-maad
 pip install praat-parselmouth
 pip install matplotlib
 pip install torchvision
 pip install madmom
 pip install torchstat
 Environment settings:
 export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin
 export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame
 """
 import os
 import glob
 import json
 import shutil
 import librosa
 from ref.so_vits_svc.inference_main import *
-from ref.adaptive_voice_conversion.spk_compare1 import infer_load, infer_main
+from ref.speaker_feature_extractor.sf_extractor_interface import SFExtractorInterface

 gs_res_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/online_models"
 gs_model_dir = os.path.join(gs_res_dir, 'models')
 gs_config_path = os.path.join(gs_res_dir, 'config.json')
 gs_draw_volume_exe = "/opt/soft/bin/draw_volume"
 gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"

 # Error codes
 gs_scg_success = 0
 gs_scg_no_vocal = 1
 gs_scg_svc_trans_442 = 2
 gs_scg_svc_volume = 3
 gs_scg_svc_mix = 4
 gs_scg_svc_trans_mix = 5


 class SongCoverGenerator:
     def __init__(self):
         self.models = glob.glob(os.path.join(gs_model_dir, "*/*pth"))
-        self.gs_infer = infer_load()
+        self.gs_infer = SFExtractorInterface()

     def mix(self, cid, work_dir, svc_file, vocal_file, acc_file, mix_path):
         """
         :param cid:
         :param work_dir:
         :param svc_file:
         :param vocal_file:
         :param acc_file:
         :param mix_path:
         :return:
         """
         cache_dir = os.path.join(work_dir, "cache")
         if os.path.exists(cache_dir):
             shutil.rmtree(cache_dir)
         os.makedirs(cache_dir)

         # Transcode the SVC output to 44.1 kHz stereo
         svc_442_file = os.path.join(cache_dir, "442.wav")
         st = time.time()
         cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(svc_file, svc_442_file)
         os.system(cmd)
         if not os.path.exists(svc_442_file):
             return gs_scg_svc_trans_442
         logging.info("cid={},transcode,{},sp={}".format(cid, svc_file, time.time() - st))

         # After transcoding, stretch the volume once more so the loudness matches the original vocal
         st = time.time()
         volume_path = os.path.join(cache_dir, "volume.wav")
         cmd = "{} {} {} {}".format(gs_draw_volume_exe, svc_442_file, vocal_file, volume_path)
         os.system(cmd)
         if not os.path.exists(volume_path):
             print("{} {} ERROR draw volume".format(cid, volume_path))
             return gs_scg_svc_volume
         logging.info("cid={},draw_volume2,{},sp={}".format(cid, svc_file, time.time() - st))

         # Mix with the accompaniment
         st = time.time()
         mix_wav_path = os.path.join(cache_dir, "mix.wav")
         cmd = "{} {} {} {}".format(gs_simple_mixer_path, volume_path, acc_file, mix_wav_path)
         os.system(cmd)
         if not os.path.exists(mix_wav_path):
             return gs_scg_svc_mix
         logging.info("cid={},mixer,{},sp={}".format(cid, svc_file, time.time() - st))

         # Encode to m4a
         st = time.time()
         cmd = "ffmpeg -i {} -ab 128k -y {} -loglevel fatal".format(mix_wav_path, mix_path)
         print(cmd)
         os.system(cmd)
         if not os.path.exists(mix_path):
             return gs_scg_svc_trans_mix
         logging.info("cid={},encode,{},sp={}".format(cid, svc_file, time.time() - st))
         return gs_scg_success

     def process_logic(self, cid, work_dir):
         """
         work_dir:
             ---vocal.wav  # vocal and accompaniment are both 44.1 kHz stereo by default
             ---acc.wav
             ---svc_vocals
                 model1.wav
                 model2.wav
             ---cache
                 model1_tmp.wav
                 model2_tmp.wav
             ---output
                 model1.m4a
                 model2.m4a
             ---emb
                 model1.npy
                 model2.npy
         :param cid:
         :param work_dir:
         :return:
         """
         p_start = time.time()
         # vocal_wav = os.path.join(work_dir, "vocal.wav")
         vocal_wav = os.path.join(work_dir, "vocal_01.wav")
         vocal_32_wav = os.path.join(work_dir, "vocal_32.wav")
         # acc_wav = os.path.join(work_dir, "acc.wav")
         acc_wav = os.path.join(work_dir, "acc_01.wav")
         if not os.path.exists(vocal_wav) or not os.path.exists(acc_wav):
             return gs_scg_no_vocal

         # Resample the vocal to 32 kHz mono
         audio, sr = librosa.load(vocal_wav, sr=32000, mono=True)
         soundfile.write(vocal_32_wav, audio, sr, format="wav")

         # Start generating the SVC covers
         svc_vocal_dir = os.path.join(work_dir, "svc_vocals")
         if not os.path.exists(svc_vocal_dir):
             os.makedirs(svc_vocal_dir)
         print("cid={}, start svc ...".format(cid))
         st = time.time()
         out_files = []
         for model_path in self.models:
             model_name = model_path.split("/")[-1].replace(".pth", "")
             dst_path = os.path.join(svc_vocal_dir, "{}_{}.wav".format(cid, model_name))
             if os.path.exists(dst_path):
                 continue
             if not os.path.exists(dst_path):
                 try:
                     inf(model_path, gs_config_path, vocal_32_wav, dst_path, 'cuda')
                 except Exception as ex:
                     print(ex)
             if os.path.exists(dst_path):
                 out_files.append(dst_path)
         print("cid={}, svc finish sp={}, len={}".format(cid, time.time() - st, len(out_files)))

         # Extract speaker features
         print("cid={}, start get emb".format(cid))
         output_dir = os.path.join(work_dir, "output")
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
         emb_dir = os.path.join(work_dir, "emb")
         if not os.path.exists(emb_dir):
             os.makedirs(emb_dir)
         for file in out_files:
-            spk_emb = infer_main(self.gs_infer, file).detach().cpu().numpy()
+            spk_emb = self.gs_infer.process(file)
             fname = file.split("/")[-1].replace(".wav", "")
             emb_file = os.path.join(emb_dir, "{}".format(fname))
             np.save(emb_file, spk_emb)

             cur_name = file.split("/")[-1].replace(".wav", ".m4a")
             mix_path = os.path.join(output_dir, "{}".format(cur_name))
             err = self.mix(cid, work_dir, file, vocal_wav, acc_wav, mix_path)
             if err != gs_scg_success:
                 print("cid={}, mix err code={}".format(cid, err))
         print("cid={}, finish, sp={}".format(cid, time.time() - p_start))


 if __name__ == '__main__':
     scg = SongCoverGenerator()
     ww_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/online_data/step1"
     # cid = "AIYpdjQVidc."  # female
     # cid = "Vds8ddYXYZY."  # male
     for cid in ["AIYpdjQVidc.", "Vds8ddYXYZY."]:
         scg.process_logic(cid, os.path.join(ww_dir, cid))
diff --git a/AutoCoverTool/ref/adaptive_voice_conversion b/AutoCoverTool/ref/adaptive_voice_conversion
deleted file mode 160000
index d1cad9f..0000000
--- a/AutoCoverTool/ref/adaptive_voice_conversion
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit d1cad9fb4eff74ca56714f9a2527124132fb1ed7
diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/LICENSE b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/LICENSE
new file mode 100644
index 0000000..9d27e6d
--- /dev/null
+++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2020] [Ju-Chieh Chou] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
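The inference_one_v1.py change above now obtains speaker embeddings via `SFExtractorInterface().process(wav_path)` and stores them with `np.save`, while the vendored adaptive_voice_conversion code added below provides the underlying speaker encoder (`AE.get_speaker_embeddings`). The actual `ref/speaker_feature_extractor/sf_extractor_interface.py` module is not included in this diff, so the following is only a rough sketch of how such a `process()` call could be built on top of the vendored model; the class name, import paths, and file locations are illustrative assumptions, not the real implementation.

```python
# Illustrative sketch only (not the real sf_extractor_interface.py); paths and names are assumptions.
import pickle

import torch
import yaml

from adaptive_voice_conversion.model import AE                                   # added below in this patch
from adaptive_voice_conversion.preprocess.tacotron.utils import get_spectrograms  # 24 kHz, 512-mel features


class SpeakerFeatureExtractorSketch:
    def __init__(self, config_path, ckpt_path, attr_path):
        with open(config_path) as f:
            self.config = yaml.load(f, Loader=yaml.FullLoader)
        with open(attr_path, "rb") as f:
            self.attr = pickle.load(f)                       # mel mean/std computed at preprocess time
        self.model = AE(self.config).eval()
        self.model.load_state_dict(torch.load(ckpt_path, map_location="cpu"))

    def process(self, wav_path):
        mel, _ = get_spectrograms(wav_path)                  # (T, n_mels), float32
        mel = (mel - self.attr["mean"]) / self.attr["std"]   # same normalization as training
        # config.yaml sets data_loader.frame_size: 1, so no frame stacking is needed: (1, n_mels, T)
        x = torch.from_numpy(mel).unsqueeze(0).transpose(1, 2)
        with torch.no_grad():
            emb = self.model.get_speaker_embeddings(x)       # (1, c_out) = (1, 128)
        return emb.squeeze(0).numpy()
```

The `premodes/vctk_model.ckpt`, `premodes/attr.pkl`, and `config.yaml` files added in this patch are the natural inputs for `ckpt_path`, `attr_path`, and `config_path` in a wrapper of this shape.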
diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/README.md b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/README.md
new file mode 100644
index 0000000..8bd3489
--- /dev/null
+++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/README.md
@@ -0,0 +1,80 @@
+# One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization
+This is the official implementation of the paper [One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization](https://arxiv.org/abs/1904.05742).
+By learning speaker and content representations separately, we can achieve one-shot VC with only one utterance from the source speaker and one utterance from the target speaker.
+You can find the demo webpage [here](https://jjery2243542.github.io/one-shot-vc-demo/), and download the pretrained model from [here](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is19/vctk_model.ckpt) and the corresponding normalization parameters for inference from [here](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is19/attr.pkl).
+
+# Dependencies
+- python 3.6+
+- pytorch 1.0.1
+- numpy 1.16.0
+- librosa 0.6.3
+- SoundFile 0.10.2
+- tensorboardX
+
+We also use some preprocessing scripts from [Kyubyong/tacotron](https://github.com/Kyubyong/tacotron) and [magenta/magenta/models/gansynth](https://github.com/tensorflow/magenta/tree/master/magenta/models/gansynth).
+
+# Differences from the paper
+The implementation differs slightly from the paper in ways we found useful for stabilizing training or improving audio quality. Because the experiments require human evaluation, we only updated the code, not the paper. The differences are listed below:
+- Dropout is not applied to the speaker encoder and content encoder.
+- Normalization is placed at the pre-activation position.
+- The original KL-divergence loss is used for the VAE rather than the unit-variance version.
+- KL annealing is used, and the weight increases to 1.
+
+# Preprocess
+We provide preprocessing scripts for two datasets: VCTK and LibriTTS. The download links are below.
+- [CSTR VCTK Corpus](https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html)
+- [LibriTTS Corpus](http://www.openslr.org/60/)
+
+The experiments in the paper are done on VCTK.
+
+The preprocessing code is at ```preprocess/```.
+The configuration for preprocessing is at ```preprocess/libri.config``` or ```preprocess/vctk.config```, depending on which dataset you use, where:
+- **segment\_size** is the segment size for training. Default: 128
+- **data\_dir** is the directory to put preprocessed files.
+- **raw\_data\_dir** is the directory containing the raw data, e.g. ```LibriTTS/``` or ```VCTK-Corpus/```.
+- **n_out_speakers** is the number of speakers held out for testing. Default: 20.
+- **test\_prop** is the proportion of validation utterances. Default: 0.1
+- **training\_samples** is the number of sampled segments for training (sampled at the preprocess stage). Default: 10000000.
+- **testing_samples** is the number of sampled segments for testing. Default: 10000.
+- **n\_utt\_attr** is the number of utterances used to compute the mean and standard deviation for normalization. Default: 5000.
+- **train_set**: LibriTTS only. The subset used for training. Default: train-clean-100.
+- **test_set**: LibriTTS only. The subset used for testing. Default: dev-clean.
+
+Once you have edited the config file, run ```preprocess_vctk.sh``` or ```preprocess_libri.sh``` to preprocess the dataset.
+
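The preprocessing stage produces ```train.pkl```-style feature pickles plus sampled index files such as ```train_samples_128.json```; these are consumed by ```PickleDataset``` and ```get_data_loader``` from ```data_utils.py```, which is added later in this patch. A minimal sketch of loading one training batch, using placeholder paths and the defaults from ```config.yaml```:

```python
# Sketch only: paths are placeholders; segment_size/frame_size/batch_size follow config.yaml defaults.
from data_utils import PickleDataset, get_data_loader

dataset = PickleDataset(pickle_path="sr_24000_mel_norm/train.pkl",
                        sample_index_path="sr_24000_mel_norm/train_samples_128.json",
                        segment_size=128)
loader = get_data_loader(dataset, batch_size=128, frame_size=1, shuffle=True)
batch = next(iter(loader))   # (batch_size, n_mels * frame_size, segment_size // frame_size)
print(batch.shape)           # e.g. torch.Size([128, 512, 128]) with the default 512-mel features
```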
+Also, you can change the feature extraction config in ```preprocess/tacotron/hyperparams.py``` + +# Training +The default arguments can be found in ```train.sh```. The usage of each arguments are listed below. +- **-c**: the path of config file, the default hyper-parameters can be found at ```config.yaml```. +- **-iters**: train the model with how many iterations. default: 200000 +- **-summary_steps**: record training loss every n steps. +- **-t**: the tag for tensorboard. +- **-train_set**: the data file for training (```train``` if the file is train.pkl). Default: ```train``` +- **-train_index_file**: the name of training index file. Default: ```train_samples_128.json``` +- **-data_dir**: the directory for processed data. +- **-store_model_path**: the path to store the model. + +# Inference +You can use ```inference.py``` to inference. +- **-c**: the path of config file. +- **-m**: the path of model checkpoint. +- **-a**: the attribute file for normalization ad denormalization. +- **-s**: the path of source file (.wav). +- **-t**: the path of target file (.wav). +- **-o**: the path of output converted file (.wav). + +# Reference +Please cite our paper if you find this repository useful. +``` +@article{chou2019one, + title={One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization}, + author={Chou, Ju-chieh and Yeh, Cheng-chieh and Lee, Hung-yi}, + journal={arXiv preprint arXiv:1904.05742}, + year={2019} +} +``` + +# Contact +If you have any question about the paper or the code, feel free to email me at [jjery2243542@gmail.com](jjery2243542@gmail.com). diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/config.yaml b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/config.yaml new file mode 100644 index 0000000..2ec1c40 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/config.yaml @@ -0,0 +1,52 @@ +SpeakerEncoder: + c_in: 512 + c_h: 128 + c_out: 128 + kernel_size: 5 + bank_size: 8 + bank_scale: 1 + c_bank: 128 + n_conv_blocks: 6 + n_dense_blocks: 6 + subsample: [1, 2, 1, 2, 1, 2] + act: 'relu' + dropout_rate: 0 +ContentEncoder: + c_in: 512 + c_h: 128 + c_out: 128 + kernel_size: 5 + bank_size: 8 + bank_scale: 1 + c_bank: 128 + n_conv_blocks: 6 + subsample: [1, 2, 1, 2, 1, 2] + act: 'relu' + dropout_rate: 0 +Decoder: + c_in: 128 + c_cond: 128 + c_h: 128 + c_out: 512 + kernel_size: 5 + n_conv_blocks: 6 + upsample: [2, 1, 2, 1, 2, 1] + act: 'relu' + sn: False + dropout_rate: 0 +data_loader: + segment_size: 128 + frame_size: 1 + batch_size: 128 + shuffle: True +optimizer: + lr: 0.0005 + beta1: 0.9 + beta2: 0.999 + amsgrad: True + weight_decay: 0.0001 + grad_norm: 5 +lambda: + lambda_rec: 10 + lambda_kl: 1 +annealing_iters: 20000 diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/data_utils.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/data_utils.py new file mode 100644 index 0000000..37bfaab --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/data_utils.py @@ -0,0 +1,58 @@ +import torch +from torch.utils.data import Dataset +import os +import pickle +import json +import numpy as np +import torch +from torch.utils.data import DataLoader + +class CollateFn(object): + def __init__(self, frame_size): + self.frame_size = frame_size + + def make_frames(self, tensor): + out = tensor.view(tensor.size(0), tensor.size(1) // self.frame_size, self.frame_size * 
tensor.size(2)) + out = out.transpose(1, 2) + return out + + def __call__(self, l): + data_tensor = torch.from_numpy(np.array(l)) + segment = self.make_frames(data_tensor) + return segment + +def get_data_loader(dataset, batch_size, frame_size, shuffle=True, num_workers=4, drop_last=False): + _collate_fn = CollateFn(frame_size=frame_size) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, + num_workers=num_workers, collate_fn=_collate_fn, pin_memory=True) + return dataloader + +class SequenceDataset(Dataset): + def __init__(self, data): + self.data = data + self.utt_ids = list(self.data.keys()) + + def __getitem__(self, ind): + utt_id = self.utt_ids[ind] + ret = self.data[utt_id].transpose() + return ret + + def __len__(self): + return len(self.utt_ids) + +class PickleDataset(Dataset): + def __init__(self, pickle_path, sample_index_path, segment_size): + with open(pickle_path, 'rb') as f: + self.data = pickle.load(f) + with open(sample_index_path, 'r') as f: + self.indexes = json.load(f) + self.segment_size = segment_size + + def __getitem__(self, ind): + utt_id, t = self.indexes[ind] + segment = self.data[utt_id][t:t + self.segment_size] + return segment + + def __len__(self): + return len(self.indexes) + diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/inference.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/inference.py new file mode 100644 index 0000000..3c80b5c --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/inference.py @@ -0,0 +1,112 @@ +import torch +import numpy as np +import sys +import os +import torch.nn as nn +import torch.nn.functional as F +import yaml +import pickle +from model import AE +from utils import * +from functools import reduce +import json +from collections import defaultdict +from torch.utils.data import Dataset +from torch.utils.data import TensorDataset +from torch.utils.data import DataLoader +from argparse import ArgumentParser, Namespace +from scipy.io.wavfile import write +import random +from preprocess.tacotron.utils import melspectrogram2wav +from preprocess.tacotron.utils import get_spectrograms +import librosa + +class Inferencer(object): + def __init__(self, config, args): + # config store the value of hyperparameters, turn to attr by AttrDict + self.config = config + print(config) + # args store other information + self.args = args + print(self.args) + + # init the model with config + self.build_model() + + # load model + self.load_model() + + with open(self.args.attr, 'rb') as f: + self.attr = pickle.load(f) + + def load_model(self): + print(f'Load model from {self.args.model}') + self.model.load_state_dict(torch.load(f'{self.args.model}')) + return + + def build_model(self): + # create model, discriminator, optimizers + self.model = cc(AE(self.config)) + print(self.model) + self.model.eval() + return + + def utt_make_frames(self, x): + frame_size = self.config['data_loader']['frame_size'] + remains = x.size(0) % frame_size + if remains != 0: + x = F.pad(x, (0, remains)) + out = x.view(1, x.size(0) // frame_size, frame_size * x.size(1)).transpose(1, 2) + return out + + def inference_one_utterance(self, x, x_cond): + x = self.utt_make_frames(x) + x_cond = self.utt_make_frames(x_cond) + dec,spk_emb = self.model.inference(x, x_cond) + print('@@@@spk_emb.size',spk_emb.shape) + dec = dec.transpose(1, 2).squeeze(0) + dec = dec.detach().cpu().numpy() + dec = self.denormalize(dec) + wav_data = melspectrogram2wav(dec) + return 
wav_data, dec + + def denormalize(self, x): + m, s = self.attr['mean'], self.attr['std'] + ret = x * s + m + return ret + + def normalize(self, x): + m, s = self.attr['mean'], self.attr['std'] + ret = (x - m) / s + return ret + + def write_wav_to_file(self, wav_data, output_path): + write(output_path, rate=self.args.sample_rate, data=wav_data) + return + + def inference_from_path(self): + src_mel, _ = get_spectrograms(self.args.source) + tar_mel, _ = get_spectrograms(self.args.target) + src_mel = torch.from_numpy(self.normalize(src_mel)).cuda() + tar_mel = torch.from_numpy(self.normalize(tar_mel)).cuda() + conv_wav, conv_mel = self.inference_one_utterance(src_mel, tar_mel) + self.write_wav_to_file(conv_wav, self.args.output) + return + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('-attr', '-a', help='attr file path') + parser.add_argument('-config', '-c', help='config file path') + parser.add_argument('-model', '-m', help='model path') + parser.add_argument('-source', '-s', help='source wav path') + parser.add_argument('-target', '-t', help='target wav path') + parser.add_argument('-output', '-o', help='output wav path') + parser.add_argument('-sample_rate', '-sr', help='sample rate', default=24000, type=int) + args = parser.parse_args() + # load config file + print('args.config: ',args.config) + with open(args.config) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + inferencer = Inferencer(config=config, args=args) + inferencer.inference_from_path() + print("args.output:",args.output) diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/main.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/main.py new file mode 100644 index 0000000..7a56794 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/main.py @@ -0,0 +1,33 @@ +from argparse import ArgumentParser, Namespace +import torch +from solver import Solver +import yaml +import sys + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('-config', '-c', default='config.yaml') + parser.add_argument('-data_dir', '-d', + default='/storage/feature/LibriTTS/sr_24000_mel_norm') + parser.add_argument('-train_set', default='train') + parser.add_argument('-train_index_file', default='train_samples_64.json') + parser.add_argument('-logdir', default='log/') + parser.add_argument('--load_model', action='store_true') + parser.add_argument('--load_opt', action='store_true') + parser.add_argument('-store_model_path', default='/storage/model/adaptive_vc/model') + parser.add_argument('-load_model_path', default='/storage/model/adaptive_vc/model') + parser.add_argument('-summary_steps', default=100, type=int) + parser.add_argument('-save_steps', default=5000, type=int) + parser.add_argument('-tag', '-t', default='init') + parser.add_argument('-iters', default=0, type=int) + + args = parser.parse_args() + + # load config file + with open(args.config) as f: + config = yaml.load(f) + + solver = Solver(config=config, args=args) + + if args.iters > 0: + solver.train(n_iterations=args.iters) diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/model.png b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/model.png new file mode 100644 index 0000000..766aaca Binary files /dev/null and b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/model.png differ diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/model.py 
b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/model.py new file mode 100644 index 0000000..5f6aebb --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/model.py @@ -0,0 +1,412 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.autograd as ag +import numpy as np +from math import ceil +from functools import reduce +from torch.nn.utils import spectral_norm +from adaptive_voice_conversion.utils import cc + + +class DummyEncoder(object): + def __init__(self, encoder): + self.encoder = encoder + + def load(self, target_network): + self.encoder.load_state_dict(target_network.state_dict()) + + def __call__(self, x): + return self.encoder(x) + + +def pad_layer(inp, layer, pad_type='reflect'): + kernel_size = layer.kernel_size[0] + if kernel_size % 2 == 0: + pad = (kernel_size // 2, kernel_size // 2 - 1) + else: + pad = (kernel_size // 2, kernel_size // 2) + # padding + inp = F.pad(inp, + pad=pad, + mode=pad_type) + out = layer(inp) + return out + + +def pad_layer_2d(inp, layer, pad_type='reflect'): + kernel_size = layer.kernel_size + if kernel_size[0] % 2 == 0: + pad_lr = [kernel_size[0] // 2, kernel_size[0] // 2 - 1] + else: + pad_lr = [kernel_size[0] // 2, kernel_size[0] // 2] + if kernel_size[1] % 2 == 0: + pad_ud = [kernel_size[1] // 2, kernel_size[1] // 2 - 1] + else: + pad_ud = [kernel_size[1] // 2, kernel_size[1] // 2] + pad = tuple(pad_lr + pad_ud) + # padding + inp = F.pad(inp, + pad=pad, + mode=pad_type) + out = layer(inp) + return out + + +def pixel_shuffle_1d(inp, scale_factor=2): + batch_size, channels, in_width = inp.size() + channels //= scale_factor + out_width = in_width * scale_factor + inp_view = inp.contiguous().view(batch_size, channels, scale_factor, in_width) + shuffle_out = inp_view.permute(0, 1, 3, 2).contiguous() + shuffle_out = shuffle_out.view(batch_size, channels, out_width) + return shuffle_out + + +def upsample(x, scale_factor=2): + x_up = F.interpolate(x, scale_factor=scale_factor, mode='nearest') + return x_up + + +def flatten(x): + out = x.contiguous().view(x.size(0), x.size(1) * x.size(2)) + return out + + +def concat_cond(x, cond): + # x = [batch_size, x_channels, length] + # cond = [batch_size, c_channels] + cond = cond.unsqueeze(dim=2) + cond = cond.expand(*cond.size()[:-1], x.size(-1)) + out = torch.cat([x, cond], dim=1) + return out + + +def append_cond(x, cond): + # x = [batch_size, x_channels, length] + # cond = [batch_size, x_channels * 2] + p = cond.size(1) // 2 + mean, std = cond[:, :p], cond[:, p:] + out = x * std.unsqueeze(dim=2) + mean.unsqueeze(dim=2) + return out + + +def conv_bank(x, module_list, act, pad_type='reflect'): + outs = [] + for layer in module_list: + out = act(pad_layer(x, layer, pad_type)) + outs.append(out) + out = torch.cat(outs + [x], dim=1) + return out + + +def get_act(act): + if act == 'relu': + return nn.ReLU() + elif act == 'lrelu': + return nn.LeakyReLU() + else: + return nn.ReLU() + + +class MLP(nn.Module): + def __init__(self, c_in, c_h, n_blocks, act, sn): + super(MLP, self).__init__() + self.act = get_act(act) + self.n_blocks = n_blocks + f = spectral_norm if sn else lambda x: x + self.in_dense_layer = f(nn.Linear(c_in, c_h)) + self.first_dense_layers = nn.ModuleList([f(nn.Linear(c_h, c_h)) for _ in range(n_blocks)]) + self.second_dense_layers = nn.ModuleList([f(nn.Linear(c_h, c_h)) for _ in range(n_blocks)]) + + def forward(self, x): + h = self.in_dense_layer(x) + for l in range(self.n_blocks): + y = 
self.first_dense_layers[l](h) + y = self.act(y) + y = self.second_dense_layers[l](y) + y = self.act(y) + h = h + y + return h + + +class Prenet(nn.Module): + def __init__(self, c_in, c_h, c_out, + kernel_size, n_conv_blocks, + subsample, act, dropout_rate): + super(Prenet, self).__init__() + self.act = get_act(act) + self.subsample = subsample + self.n_conv_blocks = n_conv_blocks + self.in_conv_layer = nn.Conv2d(1, c_h, kernel_size=kernel_size) + self.first_conv_layers = nn.ModuleList([nn.Conv2d(c_h, c_h, kernel_size=kernel_size) for _ \ + in range(n_conv_blocks)]) + self.second_conv_layers = nn.ModuleList([nn.Conv2d(c_h, c_h, kernel_size=kernel_size, stride=sub) + for sub, _ in zip(subsample, range(n_conv_blocks))]) + output_size = c_in + for l, sub in zip(range(n_conv_blocks), self.subsample): + output_size = ceil(output_size / sub) + self.out_conv_layer = nn.Conv1d(c_h * output_size, c_out, kernel_size=1) + self.dropout_layer = nn.Dropout(p=dropout_rate) + self.norm_layer = nn.InstanceNorm2d(c_h, affine=False) + + def forward(self, x): + # reshape x to 4D + x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2)) + out = pad_layer_2d(x, self.in_conv_layer) + out = self.act(out) + out = self.norm_layer(out) + for l in range(self.n_conv_blocks): + y = pad_layer_2d(out, self.first_conv_layers[l]) + y = self.act(y) + y = self.norm_layer(y) + y = self.dropout_layer(y) + y = pad_layer_2d(y, self.second_conv_layers[l]) + y = self.act(y) + y = self.norm_layer(y) + y = self.dropout_layer(y) + if self.subsample[l] > 1: + out = F.avg_pool2d(out, kernel_size=self.subsample[l], ceil_mode=True) + out = y + out + out = out.contiguous().view(out.size(0), out.size(1) * out.size(2), out.size(3)) + out = pad_layer(out, self.out_conv_layer) + out = self.act(out) + return out + + +class Postnet(nn.Module): + def __init__(self, c_in, c_h, c_out, c_cond, + kernel_size, n_conv_blocks, + upsample, act, sn): + super(Postnet, self).__init__() + self.act = get_act(act) + self.upsample = upsample + self.c_h = c_h + self.n_conv_blocks = n_conv_blocks + f = spectral_norm if sn else lambda x: x + total_upsample = reduce(lambda x, y: x * y, upsample) + self.in_conv_layer = f(nn.Conv1d(c_in, c_h * c_out // total_upsample, kernel_size=1)) + self.first_conv_layers = nn.ModuleList([f(nn.Conv2d(c_h, c_h, kernel_size=kernel_size)) for _ \ + in range(n_conv_blocks)]) + self.second_conv_layers = nn.ModuleList([f(nn.Conv2d(c_h, c_h * up * up, kernel_size=kernel_size)) + for up, _ in zip(upsample, range(n_conv_blocks))]) + self.out_conv_layer = f(nn.Conv2d(c_h, 1, kernel_size=1)) + self.conv_affine_layers = nn.ModuleList( + [f(nn.Linear(c_cond, c_h * 2)) for _ in range(n_conv_blocks * 2)]) + self.norm_layer = nn.InstanceNorm2d(c_h, affine=False) + self.ps = nn.PixelShuffle(max(upsample)) + + def forward(self, x, cond): + out = pad_layer(x, self.in_conv_layer) + out = out.contiguous().view(out.size(0), self.c_h, out.size(1) // self.c_h, out.size(2)) + for l in range(self.n_conv_blocks): + y = pad_layer_2d(out, self.first_conv_layers[l]) + y = self.act(y) + y = self.norm_layer(y) + y = append_cond_2d(y, self.conv_affine_layers[l * 2](cond)) + y = pad_layer_2d(y, self.second_conv_layers[l]) + y = self.act(y) + if self.upsample[l] > 1: + y = self.ps(y) + y = self.norm_layer(y) + y = append_cond_2d(y, self.conv_affine_layers[l * 2 + 1](cond)) + out = y + upsample(out, scale_factor=(self.upsample[l], self.upsample[l])) + else: + y = self.norm_layer(y) + y = append_cond(y, self.conv_affine_layers[l * 2 + 1](cond)) + out = y + out + 
out = self.out_conv_layer(out) + out = out.squeeze(dim=1) + return out + + +class SpeakerEncoder(nn.Module): + def __init__(self, c_in, c_h, c_out, kernel_size, + bank_size, bank_scale, c_bank, + n_conv_blocks, n_dense_blocks, + subsample, act, dropout_rate): + super(SpeakerEncoder, self).__init__() + self.c_in = c_in + self.c_h = c_h + self.c_out = c_out + self.kernel_size = kernel_size + self.n_conv_blocks = n_conv_blocks + self.n_dense_blocks = n_dense_blocks + self.subsample = subsample + self.act = get_act(act) + self.conv_bank = nn.ModuleList( + [nn.Conv1d(c_in, c_bank, kernel_size=k) for k in range(bank_scale, bank_size + 1, bank_scale)]) + in_channels = c_bank * (bank_size // bank_scale) + c_in + self.in_conv_layer = nn.Conv1d(in_channels, c_h, kernel_size=1) + self.first_conv_layers = nn.ModuleList([nn.Conv1d(c_h, c_h, kernel_size=kernel_size) for _ \ + in range(n_conv_blocks)]) + self.second_conv_layers = nn.ModuleList([nn.Conv1d(c_h, c_h, kernel_size=kernel_size, stride=sub) + for sub, _ in zip(subsample, range(n_conv_blocks))]) + self.pooling_layer = nn.AdaptiveAvgPool1d(1) + self.first_dense_layers = nn.ModuleList([nn.Linear(c_h, c_h) for _ in range(n_dense_blocks)]) + self.second_dense_layers = nn.ModuleList([nn.Linear(c_h, c_h) for _ in range(n_dense_blocks)]) + self.output_layer = nn.Linear(c_h, c_out) + self.dropout_layer = nn.Dropout(p=dropout_rate) + + def conv_blocks(self, inp): + out = inp + # convolution blocks + for l in range(self.n_conv_blocks): + y = pad_layer(out, self.first_conv_layers[l]) + y = self.act(y) + y = self.dropout_layer(y) + y = pad_layer(y, self.second_conv_layers[l]) + y = self.act(y) + y = self.dropout_layer(y) + if self.subsample[l] > 1: + out = F.avg_pool1d(out, kernel_size=self.subsample[l], ceil_mode=True) + out = y + out + return out + + def dense_blocks(self, inp): + out = inp + # dense layers + for l in range(self.n_dense_blocks): + y = self.first_dense_layers[l](out) + y = self.act(y) + y = self.dropout_layer(y) + y = self.second_dense_layers[l](y) + y = self.act(y) + y = self.dropout_layer(y) + out = y + out + return out + + def forward(self, x): + out = conv_bank(x, self.conv_bank, act=self.act) + # dimension reduction layer + out = pad_layer(out, self.in_conv_layer) + out = self.act(out) + # conv blocks + out = self.conv_blocks(out) + # avg pooling + out = self.pooling_layer(out).squeeze(2) + # dense blocks + out = self.dense_blocks(out) + out = self.output_layer(out) + return out + + +class ContentEncoder(nn.Module): + def __init__(self, c_in, c_h, c_out, kernel_size, + bank_size, bank_scale, c_bank, + n_conv_blocks, subsample, + act, dropout_rate): + super(ContentEncoder, self).__init__() + self.n_conv_blocks = n_conv_blocks + self.subsample = subsample + self.act = get_act(act) + self.conv_bank = nn.ModuleList( + [nn.Conv1d(c_in, c_bank, kernel_size=k) for k in range(bank_scale, bank_size + 1, bank_scale)]) + in_channels = c_bank * (bank_size // bank_scale) + c_in + self.in_conv_layer = nn.Conv1d(in_channels, c_h, kernel_size=1) + self.first_conv_layers = nn.ModuleList([nn.Conv1d(c_h, c_h, kernel_size=kernel_size) for _ \ + in range(n_conv_blocks)]) + self.second_conv_layers = nn.ModuleList([nn.Conv1d(c_h, c_h, kernel_size=kernel_size, stride=sub) + for sub, _ in zip(subsample, range(n_conv_blocks))]) + self.norm_layer = nn.InstanceNorm1d(c_h, affine=False) + self.mean_layer = nn.Conv1d(c_h, c_out, kernel_size=1) + self.std_layer = nn.Conv1d(c_h, c_out, kernel_size=1) + self.dropout_layer = nn.Dropout(p=dropout_rate) + + def 
forward(self, x): + out = conv_bank(x, self.conv_bank, act=self.act) + # dimension reduction layer + out = pad_layer(out, self.in_conv_layer) + out = self.norm_layer(out) + out = self.act(out) + out = self.dropout_layer(out) + # convolution blocks + for l in range(self.n_conv_blocks): + y = pad_layer(out, self.first_conv_layers[l]) + y = self.norm_layer(y) + y = self.act(y) + y = self.dropout_layer(y) + y = pad_layer(y, self.second_conv_layers[l]) + y = self.norm_layer(y) + y = self.act(y) + y = self.dropout_layer(y) + if self.subsample[l] > 1: + out = F.avg_pool1d(out, kernel_size=self.subsample[l], ceil_mode=True) + out = y + out + mu = pad_layer(out, self.mean_layer) + log_sigma = pad_layer(out, self.std_layer) + return mu, log_sigma + + +class Decoder(nn.Module): + def __init__(self, + c_in, c_cond, c_h, c_out, + kernel_size, + n_conv_blocks, upsample, act, sn, dropout_rate): + super(Decoder, self).__init__() + self.n_conv_blocks = n_conv_blocks + self.upsample = upsample + self.act = get_act(act) + f = spectral_norm if sn else lambda x: x + self.in_conv_layer = f(nn.Conv1d(c_in, c_h, kernel_size=1)) + self.first_conv_layers = nn.ModuleList([f(nn.Conv1d(c_h, c_h, kernel_size=kernel_size)) for _ \ + in range(n_conv_blocks)]) + self.second_conv_layers = nn.ModuleList( \ + [f(nn.Conv1d(c_h, c_h * up, kernel_size=kernel_size)) \ + for _, up in zip(range(n_conv_blocks), self.upsample)]) + self.norm_layer = nn.InstanceNorm1d(c_h, affine=False) + self.conv_affine_layers = nn.ModuleList( + [f(nn.Linear(c_cond, c_h * 2)) for _ in range(n_conv_blocks * 2)]) + self.out_conv_layer = f(nn.Conv1d(c_h, c_out, kernel_size=1)) + self.dropout_layer = nn.Dropout(p=dropout_rate) + + def forward(self, z, cond): + out = pad_layer(z, self.in_conv_layer) + out = self.norm_layer(out) + out = self.act(out) + out = self.dropout_layer(out) + # convolution blocks + for l in range(self.n_conv_blocks): + y = pad_layer(out, self.first_conv_layers[l]) + y = self.norm_layer(y) + y = append_cond(y, self.conv_affine_layers[l * 2](cond)) + y = self.act(y) + y = self.dropout_layer(y) + y = pad_layer(y, self.second_conv_layers[l]) + if self.upsample[l] > 1: + y = pixel_shuffle_1d(y, scale_factor=self.upsample[l]) + y = self.norm_layer(y) + y = append_cond(y, self.conv_affine_layers[l * 2 + 1](cond)) + y = self.act(y) + y = self.dropout_layer(y) + if self.upsample[l] > 1: + out = y + upsample(out, scale_factor=self.upsample[l]) + else: + out = y + out + out = pad_layer(out, self.out_conv_layer) + return out + + +class AE(nn.Module): + def __init__(self, config): + super(AE, self).__init__() + self.speaker_encoder = SpeakerEncoder(**config['SpeakerEncoder']) + self.content_encoder = ContentEncoder(**config['ContentEncoder']) + self.decoder = Decoder(**config['Decoder']) + + def forward(self, x): + emb = self.speaker_encoder(x) + mu, log_sigma = self.content_encoder(x) + eps = log_sigma.new(*log_sigma.size()).normal_(0, 1) + dec = self.decoder(mu + torch.exp(log_sigma / 2) * eps, emb) + return mu, log_sigma, emb, dec + + def inference(self, x, x_cond): + emb = self.speaker_encoder(x_cond) + mu, _ = self.content_encoder(x) + dec = self.decoder(mu, emb) + return dec, emb + + def get_speaker_embeddings(self, x): + emb = self.speaker_encoder(x) + return emb diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/premodes/attr.pkl b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/premodes/attr.pkl new file mode 100644 index 0000000..eb92c79 Binary files /dev/null and 
b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/premodes/attr.pkl differ diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/premodes/vctk_model.ckpt b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/premodes/vctk_model.ckpt new file mode 100644 index 0000000..7885987 Binary files /dev/null and b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/premodes/vctk_model.ckpt differ diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/libri.config b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/libri.config new file mode 100644 index 0000000..bc38790 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/libri.config @@ -0,0 +1,10 @@ +stage=0 +segment_size=128 +data_dir=/groups/jjery2243542/data/LibriTTS/sr_24000_mel_norm/ +raw_data_dir=/groups/jjery2243542/data/raw/LibriTTS/ +test_prop=0.05 +training_samples=10000000 +testing_samples=10000 +n_utts_attr=5000 +train_set=train-clean-100 +test_set=dev-clean diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/make_datasets_libri.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/make_datasets_libri.py new file mode 100644 index 0000000..6640594 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/make_datasets_libri.py @@ -0,0 +1,95 @@ +import pickle +import librosa +import sys +import glob +import random +import os +from collections import defaultdict +import re +import numpy as np +import json +from tacotron.utils import get_spectrograms + +def read_speaker_info(speaker_info_path): + speaker_ids = [] + with open(speaker_info_path, 'r') as f: + for i, line in enumerate(f): + if i == 0: + continue + speaker_id = line.strip().split()[0] + speaker_ids.append(speaker_id) + return speaker_ids + +def read_paths(root_dir, dset): + paths = sorted(glob.glob(os.path.join(root_dir, f'{dset}/*/*/*.wav'))) + return paths + +def get_speaker2path(root_dir, dset): + speaker2path = defaultdict(lambda : []) + for path in sorted(glob.glob(os.path.join(root_dir, f'{dset}/*/*/*.wav'))): + filename = path.strip().split('/')[-1] + speaker_id = re.match(r'(\d+)_(\d+)_(\d+)_(\d+)\.wav', filename).groups()[0] + speaker2path[speaker_id].append(path) + return speaker2path + +def spec_feature_extraction(wav_file): + mel, mag = get_spectrograms(wav_file) + return mel, mag + +if __name__ == '__main__': + data_dir = sys.argv[1] + output_dir = sys.argv[2] + dev_proportion = float(sys.argv[3]) + n_utts_attr = int(sys.argv[4]) + train_set = sys.argv[5] + test_set = sys.argv[6] + + paths = read_paths(data_dir, train_set) + random.shuffle(paths) + dev_data_size = int(len(paths) * dev_proportion) + train_paths = paths[:-dev_data_size] + dev_paths = paths[-dev_data_size:] + test_paths = read_paths(data_dir, test_set) + print(f'{len(train_paths)} training data, {len(dev_paths)} dev data, {len(test_paths)} test data') + + with open(os.path.join(output_dir, 'train_files.txt'), 'w') as f: + for path in sorted(train_paths): + filename = path.strip().split('/')[-1] + f.write(f'{filename}\n') + + with open(os.path.join(output_dir, 'dev_files.txt'), 'w') as f: + for path in sorted(dev_paths): + filename = path.strip().split('/')[-1] + f.write(f'{filename}\n') + + with open(os.path.join(output_dir, 'test_files.txt'), 'w') as f: + for path in 
sorted(test_paths): + filename = path.strip().split('/')[-1] + f.write(f'{filename}\n') + + for dset, paths in zip(['train', 'dev', 'test'], \ + [train_paths, dev_paths, test_paths]): + print(f'processing {dset} set, {len(paths)} files') + data = {} + output_path = os.path.join(output_dir, f'{dset}.pkl') + all_train_data = [] + for i, path in enumerate(paths): + if i % 500 == 0 or i == len(paths) - 1: + print(f'processing {i} files') + filename = path.strip().split('/')[-1] + mel, mag = spec_feature_extraction(path) + data[filename] = mel + if dset == 'train' and i < n_utts_attr: + all_train_data.append(mel) + if dset == 'train': + all_train_data = np.concatenate(all_train_data) + mean = np.mean(all_train_data, axis=0) + std = np.std(all_train_data, axis=0) + attr = {'mean': mean, 'std': std} + with open(os.path.join(output_dir, 'attr.pkl'), 'wb') as f: + pickle.dump(attr, f) + for key, val in data.items(): + val = (val - mean) / std + data[key] = val + with open(output_path, 'wb') as f: + pickle.dump(data, f) diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/make_datasets_vctk.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/make_datasets_vctk.py new file mode 100644 index 0000000..8351a08 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/make_datasets_vctk.py @@ -0,0 +1,104 @@ +import pickle +import librosa +import sys +import glob +import random +import os +from collections import defaultdict +import re +import numpy as np +import json +from tacotron.utils import get_spectrograms + +def read_speaker_info(speaker_info_path): + speaker_ids = [] + with open(speaker_info_path, 'r') as f: + for i, line in enumerate(f): + if i == 0: + continue + speaker_id = line.strip().split()[0] + speaker_ids.append(speaker_id) + return speaker_ids + +def read_filenames(root_dir): + speaker2filenames = defaultdict(lambda : []) + for path in sorted(glob.glob(os.path.join(root_dir, '*/*'))): + filename = path.strip().split('/')[-1] + speaker_id, utt_id = re.match(r'p(\d+)_(\d+)\.wav', filename).groups() + speaker2filenames[speaker_id].append(path) + return speaker2filenames + +def wave_feature_extraction(wav_file, sr): + y, sr = librosa.load(wav_file, sr) + y, _ = librosa.effects.trim(y, top_db=20) + return y + +def spec_feature_extraction(wav_file): + mel, mag = get_spectrograms(wav_file) + return mel, mag + +if __name__ == '__main__': + data_dir = sys.argv[1] + speaker_info_path = sys.argv[2] + output_dir = sys.argv[3] + test_speakers = int(sys.argv[4]) + test_proportion = float(sys.argv[5]) + sample_rate = int(sys.argv[6]) + n_utts_attr = int(sys.argv[7]) + + speaker_ids = read_speaker_info(speaker_info_path) + random.shuffle(speaker_ids) + + train_speaker_ids = speaker_ids[:-test_speakers] + test_speaker_ids = speaker_ids[-test_speakers:] + + speaker2filenames = read_filenames(data_dir) + + train_path_list, in_test_path_list, out_test_path_list = [], [], [] + + for speaker in train_speaker_ids: + path_list = speaker2filenames[speaker] + random.shuffle(path_list) + test_data_size = int(len(path_list) * test_proportion) + train_path_list += path_list[:-test_data_size] + in_test_path_list += path_list[-test_data_size:] + + with open(os.path.join(output_dir, 'in_test_files.txt'), 'w') as f: + for path in in_test_path_list: + f.write(f'{path}\n') + + for speaker in test_speaker_ids: + path_list = speaker2filenames[speaker] + out_test_path_list += path_list + + with 
open(os.path.join(output_dir, 'out_test_files.txt'), 'w') as f: + for path in out_test_path_list: + f.write(f'{path}\n') + + for dset, path_list in zip(['train', 'in_test', 'out_test'], \ + [train_path_list, in_test_path_list, out_test_path_list]): + print(f'processing {dset} set, {len(path_list)} files') + data = {} + output_path = os.path.join(output_dir, f'{dset}.pkl') + all_train_data = [] + for i, path in enumerate(sorted(path_list)): + if i % 500 == 0 or i == len(path_list) - 1: + print(f'processing {i} files') + filename = path.strip().split('/')[-1] + mel, mag = spec_feature_extraction(path) + data[filename] = mel + if dset == 'train' and i < n_utts_attr: + all_train_data.append(mel) + if dset == 'train': + all_train_data = np.concatenate(all_train_data) + mean = np.mean(all_train_data, axis=0) + std = np.std(all_train_data, axis=0) + attr = {'mean': mean, 'std': std} + with open(os.path.join(output_dir, 'attr.pkl'), 'wb') as f: + pickle.dump(attr, f) + for key, val in data.items(): + val = (val - mean) / std + data[key] = val + with open(output_path, 'wb') as f: + pickle.dump(data, f) + diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/preprocess_libri.sh b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/preprocess_libri.sh new file mode 100755 index 0000000..7c7df11 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/preprocess_libri.sh @@ -0,0 +1,19 @@ +. libri.config + +if [ $stage -le 0 ]; then + python3 make_datasets_libri.py $raw_data_dir/ $data_dir $test_prop $n_utts_attr $train_set $test_set +fi + +if [ $stage -le 1 ]; then + python3 reduce_dataset.py $data_dir/train.pkl $data_dir/train_$segment_size.pkl +fi + +if [ $stage -le 2 ]; then + # sample training samples + python3 sample_single_segments.py $data_dir/train.pkl $data_dir/train_samples_$segment_size.json $training_samples $segment_size +fi +if [ $stage -le 3 ]; then + # sample testing samples + python3 sample_single_segments.py $data_dir/dev.pkl $data_dir/dev_samples_$segment_size.json $testing_samples $segment_size + python3 sample_single_segments.py $data_dir/test.pkl $data_dir/test_samples_$segment_size.json $testing_samples $segment_size +fi diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/preprocess_vctk.sh b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/preprocess_vctk.sh new file mode 100755 index 0000000..2df3974 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/preprocess_vctk.sh @@ -0,0 +1,19 @@ +. 
vctk.config + +if [ $stage -le 0 ]; then + python3 make_datasets_vctk.py $raw_data_dir/wav48 $raw_data_dir/speaker-info.txt $data_dir $n_out_speakers $test_prop $sample_rate $n_utt_attr +fi + +if [ $stage -le 1 ]; then + python3 reduce_dataset.py $data_dir/train.pkl $data_dir/train_$segment_size.pkl $segment_size +fi + +if [ $stage -le 2 ]; then + # sample training samples + python3 sample_single_segments.py $data_dir/train.pkl $data_dir/train_samples_$segment_size.json $training_samples $segment_size +fi +if [ $stage -le 3 ]; then + # sample testing samples + python3 sample_single_segments.py $data_dir/in_test.pkl $data_dir/in_test_samples_$segment_size.json $testing_samples $segment_size + python3 sample_single_segments.py $data_dir/out_test.pkl $data_dir/out_test_samples_$segment_size.json $testing_samples $segment_size +fi diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/reduce_dataset.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/reduce_dataset.py new file mode 100644 index 0000000..1672ee2 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/reduce_dataset.py @@ -0,0 +1,15 @@ +import pickle +import sys + +if __name__ == '__main__': + pkl_path = sys.argv[1] + output_path = sys.argv[2] + segment_size = int(sys.argv[3]) + + with open(pkl_path, 'rb') as f: + data = pickle.load(f) + + reduced_data = {key:val for key, val in data.items() if val.shape[0] > segment_size} + + with open(output_path, 'wb') as f: + pickle.dump(reduced_data, f) diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/sample_segments.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/sample_segments.py new file mode 100644 index 0000000..c9412dd --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/sample_segments.py @@ -0,0 +1,40 @@ +import json +import pickle +import sys +import os +import random + + +if __name__ == '__main__': + pickle_path = sys.argv[1] + sample_path = sys.argv[2] + n_samples = int(sys.argv[3]) + segment_size = int(sys.argv[4]) + + with open(pickle_path, 'rb') as f: + data = pickle.load(f) + + # (utt_id, timestep_1, timestep_2, neg_utt_id, neg_timestep) + samples = [] + + # filter length > 2 * segment_size + utt_list = [key for key in data] + utt_list = sorted(list(filter(lambda u : len(data[u]) > 2 * segment_size, utt_list))) + print(f'{len(utt_list)} utterances') + sample_utt_index_list = random.choices(range(len(utt_list)), k=n_samples) + + for i, utt_ind in enumerate(sample_utt_index_list): + if i % 500 == 0: + print(f'sample {i} samples') + pos_utt_id = utt_list[utt_ind] + neg_utt_id = random.choice(utt_list[:utt_ind] + utt_list[utt_ind + 1:]) + t1 = random.randint(0, len(data[pos_utt_id]) - 2 * segment_size) + t2 = random.randint(t1 + segment_size, len(data[pos_utt_id]) - segment_size) + # random swap t1, t2 + t1, t2 = random.sample([t1, t2], k=2) + t_neg = random.randint(0, len(data[neg_utt_id]) - segment_size) + samples.append((pos_utt_id, t1, t2, neg_utt_id, t_neg)) + + with open(sample_path, 'w') as f: + json.dump(samples, f) + diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/sample_single_segments.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/sample_single_segments.py new file mode 100644 index 0000000..8bc3cf7 --- /dev/null +++ 
b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/sample_single_segments.py @@ -0,0 +1,34 @@ +import json +import pickle +import sys +import os +import random + +if __name__ == '__main__': + pickle_path = sys.argv[1] + sample_path = sys.argv[2] + n_samples = int(sys.argv[3]) + segment_size = int(sys.argv[4]) + + with open(pickle_path, 'rb') as f: + data = pickle.load(f) + + # (utt_id, timestep, neg_utt_id, neg_timestep) + samples = [] + + # filter length > segment_size + utt_list = [key for key in data] + utt_list = sorted(list(filter(lambda u : len(data[u]) > segment_size, utt_list))) + print(f'{len(utt_list)} utterances') + sample_utt_index_list = random.choices(range(len(utt_list)), k=n_samples) + + for i, utt_ind in enumerate(sample_utt_index_list): + if i % 500 == 0: + print(f'sample {i} samples') + utt_id = utt_list[utt_ind] + t = random.randint(0, len(data[utt_id]) - segment_size) + samples.append((utt_id, t)) + + with open(sample_path, 'w') as f: + json.dump(samples, f) + diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/tacotron/hyperparams.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/tacotron/hyperparams.py new file mode 100644 index 0000000..988774d --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/tacotron/hyperparams.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +#/usr/bin/python2 +''' +By kyubyong park. kbpark.linguist@gmail.com. +https://www.github.com/kyubyong/tacotron +''' +class Hyperparams: + '''Hyper parameters''' + + # pipeline + prepro = False # if True, run `python prepro.py` first before running `python train.py`. + + vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding E: End of Sentence + + # data + data = "/data/private/voice/LJSpeech-1.0" + # data = "/data/private/voice/nick" + test_data = 'harvard_sentences.txt' + max_duration = 10.0 + top_db = 15 + + # signal processing + sr = 24000 # Sample rate. + n_fft = 2048 # fft points (samples) + frame_shift = 0.0125 # seconds + frame_length = 0.05 # seconds + hop_length = int(sr*frame_shift) # samples. + win_length = int(sr*frame_length) # samples. + n_mels = 512 # Number of Mel banks to generate + power = 1.2 # Exponent for amplifying the predicted magnitude + n_iter = 100 # Number of inversion iterations + preemphasis = .97 # or None + max_db = 100 + ref_db = 20 + + # model + embed_size = 256 # alias = E + encoder_num_banks = 16 + decoder_num_banks = 8 + num_highwaynet_blocks = 4 + r = 5 # Reduction factor. Paper => 2, 3, 5 + dropout_rate = .5 + + # training scheme + lr = 0.001 # Initial learning rate. + logdir = "logdir/01" + sampledir = 'samples' + batch_size = 32 + + + + diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/tacotron/utils.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/tacotron/utils.py new file mode 100644 index 0000000..256a286 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/tacotron/utils.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- +# /usr/bin/python2 +''' +By kyubyong park. kbpark.linguist@gmail.com. 
+https://www.github.com/kyubyong/dc_tts +''' +from __future__ import print_function, division + +from .hyperparams import Hyperparams as hp +import numpy as np +import tensorflow as tf +import librosa +import copy +#import matplotlib +#matplotlib.use('pdf') +#import matplotlib.pyplot as plt +from scipy import signal +import os + +def _mel_to_linear_matrix(sr, n_fft, n_mels): + m = librosa.filters.mel(sr, n_fft, n_mels) + m_t = np.transpose(m) + p = np.matmul(m, m_t) + d = [1.0 / x if np.abs(x) > 1.0e-8 else x for x in np.sum(p, axis=0)] + return np.matmul(m_t, np.diag(d)) + +def get_spectrograms(fpath): + '''Returns normalized log(melspectrogram) and log(magnitude) from `sound_file`. + Args: + sound_file: A string. The full path of a sound file. + + Returns: + mel: A 2d array of shape (T, n_mels) <- Transposed + mag: A 2d array of shape (T, 1+n_fft/2) <- Transposed + ''' + # num = np.random.randn() + # if num < .2: + # y, sr = librosa.load(fpath, sr=hp.sr) + # else: + # if num < .4: + # tempo = 1.1 + # elif num < .6: + # tempo = 1.2 + # elif num < .8: + # tempo = 0.9 + # else: + # tempo = 0.8 + # cmd = "ffmpeg -i {} -y ar {} -hide_banner -loglevel panic -ac 1 -filter:a atempo={} -vn temp.wav".format(fpath, hp.sr, tempo) + # os.system(cmd) + # y, sr = librosa.load('temp.wav', sr=hp.sr) + + # Loading sound file + y, sr = librosa.load(fpath, sr=hp.sr) + + + # Trimming + y, _ = librosa.effects.trim(y, top_db=hp.top_db) + + # Preemphasis + y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1]) + + # stft + linear = librosa.stft(y=y, + n_fft=hp.n_fft, + hop_length=hp.hop_length, + win_length=hp.win_length) + + # magnitude spectrogram + mag = np.abs(linear) # (1+n_fft//2, T) + + # mel spectrogram + mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels) # (n_mels, 1+n_fft//2) + mel = np.dot(mel_basis, mag) # (n_mels, t) + + # to decibel + mel = 20 * np.log10(np.maximum(1e-5, mel)) + mag = 20 * np.log10(np.maximum(1e-5, mag)) + + # normalize + mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) + mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) + + # Transpose + mel = mel.T.astype(np.float32) # (T, n_mels) + mag = mag.T.astype(np.float32) # (T, 1+n_fft//2) + + return mel, mag + +def melspectrogram2wav(mel): + '''# Generate wave file from spectrogram''' + # transpose + mel = mel.T + + # de-noramlize + mel = (np.clip(mel, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db + + # to amplitude + mel = np.power(10.0, mel * 0.05) + m = _mel_to_linear_matrix(hp.sr, hp.n_fft, hp.n_mels) + mag = np.dot(m, mel) + + # wav reconstruction + wav = griffin_lim(mag) + + # de-preemphasis + wav = signal.lfilter([1], [1, -hp.preemphasis], wav) + + # trim + wav, _ = librosa.effects.trim(wav) + + return wav.astype(np.float32) + +def spectrogram2wav(mag): + '''# Generate wave file from spectrogram''' + # transpose + mag = mag.T + + # de-noramlize + mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db + + # to amplitude + mag = np.power(10.0, mag * 0.05) + + # wav reconstruction + wav = griffin_lim(mag) + + # de-preemphasis + wav = signal.lfilter([1], [1, -hp.preemphasis], wav) + + # trim + wav, _ = librosa.effects.trim(wav) + + return wav.astype(np.float32) + + +def griffin_lim(spectrogram): + '''Applies Griffin-Lim's raw. 
+ ''' + X_best = copy.deepcopy(spectrogram) + for i in range(hp.n_iter): + X_t = invert_spectrogram(X_best) + est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length) + phase = est / np.maximum(1e-8, np.abs(est)) + X_best = spectrogram * phase + X_t = invert_spectrogram(X_best) + y = np.real(X_t) + + return y + + +def invert_spectrogram(spectrogram): + ''' + spectrogram: [f, t] + ''' + return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann") + + +def plot_alignment(alignment, gs): + """Plots the alignment + alignments: A list of (numpy) matrix of shape (encoder_steps, decoder_steps) + gs : (int) global step + """ + fig, ax = plt.subplots() + im = ax.imshow(alignment) + + # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7]) + fig.colorbar(im) + plt.title('{} Steps'.format(gs)) + plt.savefig('{}/alignment_{}k.png'.format(hp.logdir, gs//1000), format='png') + +def learning_rate_decay(init_lr, global_step, warmup_steps=4000.): + '''Noam scheme from tensor2tensor''' + step = tf.cast(global_step + 1, dtype=tf.float32) + return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) + +def load_spectrograms(fpath): + fname = os.path.basename(fpath) + mel, mag = get_spectrograms(fpath) + t = mel.shape[0] + num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0 # for reduction + mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant") + mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant") + return fname, mel.reshape((-1, hp.n_mels*hp.r)), mag diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/vctk.config b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/vctk.config new file mode 100644 index 0000000..b8e5fcc --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/preprocess/vctk.config @@ -0,0 +1,10 @@ +stage=0 +segment_size=128 +data_dir=/groups/jjery2243542/data/vctk/trimmed_vctk_spectrograms/sr_24000_mel_norm/ #数据放入 +raw_data_dir=/groups/jjery2243542/data/raw/VCTK-Corpus ##输入数据 +n_out_speakers=20 +test_prop=0.1 +sample_rate=24000 +training_samples=10000000 +testing_samples=10000 +n_utt_attr=5000 diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/solver.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/solver.py new file mode 100644 index 0000000..a6ae961 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/solver.py @@ -0,0 +1,119 @@ +import torch +import numpy as np +import sys +import os +import torch.nn as nn +import torch.nn.functional as F +import yaml +import pickle +from model import AE +from data_utils import get_data_loader +from data_utils import PickleDataset +from utils import * +from functools import reduce +from collections import defaultdict + +class Solver(object): + def __init__(self, config, args): + # config store the value of hyperparameters, turn to attr by AttrDict + self.config = config + print(config) + + # args store other information + self.args = args + print(self.args) + + # logger to use tensorboard + self.logger = Logger(self.args.logdir) + + # get dataloader + self.get_data_loaders() + + # init the model with config + self.build_model() + self.save_config() + + if args.load_model: + self.load_model() + + def save_model(self, iteration): + # save model and discriminator and their optimizer + torch.save(self.model.state_dict(), f'{self.args.store_model_path}.ckpt') + 
torch.save(self.opt.state_dict(), f'{self.args.store_model_path}.opt') + + def save_config(self): + with open(f'{self.args.store_model_path}.config.yaml', 'w') as f: + yaml.dump(self.config, f) + with open(f'{self.args.store_model_path}.args.yaml', 'w') as f: + yaml.dump(vars(self.args), f) + return + + def load_model(self): + print(f'Load model from {self.args.load_model_path}') + self.model.load_state_dict(torch.load(f'{self.args.load_model_path}.ckpt')) + self.opt.load_state_dict(torch.load(f'{self.args.load_model_path}.opt')) + return + + def get_data_loaders(self): + data_dir = self.args.data_dir + self.train_dataset = PickleDataset(os.path.join(data_dir, f'{self.args.train_set}.pkl'), + os.path.join(data_dir, self.args.train_index_file), + segment_size=self.config['data_loader']['segment_size']) + self.train_loader = get_data_loader(self.train_dataset, + frame_size=self.config['data_loader']['frame_size'], + batch_size=self.config['data_loader']['batch_size'], + shuffle=self.config['data_loader']['shuffle'], + num_workers=4, drop_last=False) + self.train_iter = infinite_iter(self.train_loader) + return + + def build_model(self): + # create model, discriminator, optimizers + self.model = cc(AE(self.config)) + print(self.model) + optimizer = self.config['optimizer'] + self.opt = torch.optim.Adam(self.model.parameters(), + lr=optimizer['lr'], betas=(optimizer['beta1'], optimizer['beta2']), + amsgrad=optimizer['amsgrad'], weight_decay=optimizer['weight_decay']) + print(self.opt) + return + + def ae_step(self, data, lambda_kl): + x = cc(data) + mu, log_sigma, emb, dec = self.model(x) + criterion = nn.L1Loss() + loss_rec = criterion(dec, x) + loss_kl = 0.5 * torch.mean(torch.exp(log_sigma) + mu ** 2 - 1 - log_sigma) + loss = self.config['lambda']['lambda_rec'] * loss_rec + \ + lambda_kl * loss_kl + self.opt.zero_grad() + loss.backward() + grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), + max_norm=self.config['optimizer']['grad_norm']) + self.opt.step() + meta = {'loss_rec': loss_rec.item(), + 'loss_kl': loss_kl.item(), + 'grad_norm': grad_norm} + return meta + + def train(self, n_iterations): + for iteration in range(n_iterations): + if iteration >= self.config['annealing_iters']: + lambda_kl = self.config['lambda']['lambda_kl'] + else: + lambda_kl = self.config['lambda']['lambda_kl'] * (iteration + 1) / self.config['annealing_iters'] + data = next(self.train_iter) + meta = self.ae_step(data, lambda_kl) + # add to logger + if iteration % self.args.summary_steps == 0: + self.logger.scalars_summary(f'{self.args.tag}/ae_train', meta, iteration) + loss_rec = meta['loss_rec'] + loss_kl = meta['loss_kl'] + + print(f'AE:[{iteration + 1}/{n_iterations}], loss_rec={loss_rec:.2f}, ' + f'loss_kl={loss_kl:.2f}, lambda={lambda_kl:.1e} ', end='\r') + if (iteration + 1) % self.args.save_steps == 0 or iteration + 1 == n_iterations: + self.save_model(iteration=iteration) + print() + return + diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/spk_compare.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/spk_compare.py new file mode 100755 index 0000000..b684fc5 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/spk_compare.py @@ -0,0 +1,171 @@ +import torch +import numpy as np +import sys +import os +import torch.nn as nn +import torch.nn.functional as F +import yaml +import pickle +from model import AE +from utils import * +from functools import reduce +import json +from collections import 
defaultdict +from torch.utils.data import Dataset +from torch.utils.data import TensorDataset +from torch.utils.data import DataLoader +from argparse import ArgumentParser, Namespace +from scipy.io.wavfile import write +import random +from preprocess.tacotron.utils import melspectrogram2wav +from preprocess.tacotron.utils import get_spectrograms +import librosa + +class Inferencer(object): + def __init__(self, config, args): + # config store the value of hyperparameters, turn to attr by AttrDict + self.config = config + print(config) + # args store other information + self.args = args + print(self.args) + + # init the model with config + self.build_model() + + # load model + self.load_model() + + with open(self.args.attr, 'rb') as f: + self.attr = pickle.load(f) + + def load_model(self): + print(f'Load model from {self.args.model}') + self.model.load_state_dict(torch.load(f'{self.args.model}')) + return + + def build_model(self): + # create model, discriminator, optimizers + self.model = cc(AE(self.config)) + print(self.model) + self.model.eval() + return + + def utt_make_frames(self, x): + frame_size = self.config['data_loader']['frame_size'] + remains = x.size(0) % frame_size + if remains != 0: + x = F.pad(x, (0, remains)) + out = x.view(1, x.size(0) // frame_size, frame_size * x.size(1)).transpose(1, 2) + return out + + def inference_one_utterance(self, x, x_cond): + x = self.utt_make_frames(x) + x_cond = self.utt_make_frames(x_cond) + dec,spk_emb = self.model.inference(x, x_cond) + print('@@@@spk_emb.size',spk_emb.shape) + dec = dec.transpose(1, 2).squeeze(0) + dec = dec.detach().cpu().numpy() + dec = self.denormalize(dec) + wav_data = melspectrogram2wav(dec) + return wav_data, dec,spk_emb + + def denormalize(self, x): + m, s = self.attr['mean'], self.attr['std'] + ret = x * s + m + return ret + + def normalize(self, x): + m, s = self.attr['mean'], self.attr['std'] + ret = (x - m) / s + return ret + + def write_wav_to_file(self, wav_data, output_path): + write(output_path, rate=self.args.sample_rate, data=wav_data) + return + + def inference_from_path(self): + src_mel, _ = get_spectrograms(self.args.source) + tar_mel, _ = get_spectrograms(self.args.target) + src_mel = torch.from_numpy(self.normalize(src_mel)).cuda() + tar_mel = torch.from_numpy(self.normalize(tar_mel)).cuda() + conv_wav, conv_mel,spk_emb = self.inference_one_utterance(src_mel, tar_mel) + self.write_wav_to_file(conv_wav, self.args.output) + return spk_emb + +def infer_main(source_wav_name): + parser = ArgumentParser() + parser.add_argument('-attr', '-a', help='attr file path') + parser.add_argument('-config', '-c', help='config file path') + parser.add_argument('-model', '-m', help='model path') + parser.add_argument('-source', '-s', help='source wav path') + parser.add_argument('-target', '-t', help='target wav path') + parser.add_argument('-output', '-o', help='output wav path') + parser.add_argument('-sample_rate', '-sr', help='sample rate', default=24000, type=int) + args = parser.parse_args() + + + args.attr = './premodes/attr.pkl' + args.config = './config.yaml' + args.model = './premodes/vctk_model.ckpt' + args.source = '/data/bingxiao.fang/voice_conversion/voice_wav/target_path/jl_24k.wav' #source_wav_name + args.target = source_wav_name # '/data/bingxiao.fang/voice_conversion/voice_wav/target_path/jl_24k.wav' + args.output = '/data/bingxiao.fang/voice_conversion/voice_wav/synthesis_pth/yibo2jl_24k.wav' + + # load config file + print('args.config: ',args.config) + with open(args.config) as f: + config = 
yaml.load(f, Loader=yaml.FullLoader) + inferencer = Inferencer(config=config, args=args) + spk_emb = inferencer.inference_from_path() + # print("args.output:",args.output) + return spk_emb + +def l2_norm(s1, s2): + norm = torch.sum(s1 * s2, -1, keepdim=True) + return norm + +def cos_distance(s1,s2,eps=1e-8): + + s1_s2_norm = l2_norm(s1, s2) + s2_s2_norm = l2_norm(s2, s2) + s1_s1_norm = l2_norm(s1, s1) + print('s1_s1_norm: ',s1_s1_norm) + print('s1_s2_norm: ',s1_s2_norm) + print('s2_s2_norm: ',s2_s2_norm) + loss = s1_s2_norm / (torch.sqrt(s2_s2_norm *s1_s1_norm) + eps) + + return loss + +def similarity_pro(wav_name1,wav_name2): + + spk_emb1 = infer_main(wav_name1) + spk_emb2 = infer_main(wav_name2) + + print('spk_emb1 size: ',spk_emb1.shape) + pro = cos_distance(spk_emb1,spk_emb2) + print("probability size is :", pro.shape) + return pro + + + + + +if __name__ == '__main__': + + wav_name1 = '/data/bingxiao.fang/voice_conversion/voice_wav/target_path/jl_24k.wav' + wav_name2 = '/data/bingxiao.fang/voice_conversion/voice_wav/source_path/yibo_24k.wav' + wav_name3 = '/data/bingxiao.fang/voice_conversion/voice_wav/target_path/180_3_3_24k.wav' + pro1 = similarity_pro(wav_name1,wav_name3) + pro2 = similarity_pro(wav_name1,wav_name2) + pro3 = similarity_pro(wav_name3,wav_name2) + + print('###jl and suiyanzi wavcos distance :',pro1) + print('###jl and yibo wav cos distance :',pro2) + print('###yibo and sunyanzi wav cos distance :',pro3) + + + + + + diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/spk_compare1.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/spk_compare1.py new file mode 100755 index 0000000..17d57de --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/spk_compare1.py @@ -0,0 +1,180 @@ +import torch +import numpy as np +import sys +import os +import torch.nn as nn +import torch.nn.functional as F +import yaml +import pickle +from adaptive_voice_conversion.model import AE +from adaptive_voice_conversion.utils import * +from functools import reduce +import json +from collections import defaultdict +from torch.utils.data import Dataset +from torch.utils.data import TensorDataset +from torch.utils.data import DataLoader +from argparse import ArgumentParser, Namespace +from scipy.io.wavfile import write +import random +from adaptive_voice_conversion.preprocess.tacotron.utils import melspectrogram2wav +from adaptive_voice_conversion.preprocess.tacotron.utils import get_spectrograms +import librosa + + +class Inferencer(object): + def __init__(self, config, args): + # config store the value of hyperparameters, turn to attr by AttrDict + self.config = config + print(config) + # args store other information + self.args = args + print(self.args) + + # init the model with config + self.build_model() + + # load model + self.load_model() + + with open(self.args.attr, 'rb') as f: + self.attr = pickle.load(f) + + def load_model(self): + # print(f'Load model from {self.args.model}') + self.model.load_state_dict(torch.load(f'{self.args.model}')) + return + + def build_model(self): + # create model, discriminator, optimizers + self.model = cc(AE(self.config)) + # print(self.model) + self.model.eval() + return + + def utt_make_frames(self, x): + frame_size = self.config['data_loader']['frame_size'] + remains = x.size(0) % frame_size + if remains != 0: + x = F.pad(x, (0, remains)) + out = x.view(1, x.size(0) // frame_size, frame_size * x.size(1)).transpose(1, 2) + return out + + def 
inference_one_utterance(self, x, x_cond): + x = self.utt_make_frames(x) + x_cond = self.utt_make_frames(x_cond) + dec, spk_emb = self.model.inference(x, x_cond) + print('@@@@spk_emb.size', spk_emb.shape) + dec = dec.transpose(1, 2).squeeze(0) + dec = dec.detach().cpu().numpy() + dec = self.denormalize(dec) + wav_data = melspectrogram2wav(dec) + return wav_data, dec, spk_emb + + def get_emb(self, x): + x = self.utt_make_frames(x) + return self.model.get_speaker_embeddings(x) + + def denormalize(self, x): + m, s = self.attr['mean'], self.attr['std'] + ret = x * s + m + return ret + + def normalize(self, x): + m, s = self.attr['mean'], self.attr['std'] + ret = (x - m) / s + return ret + + def write_wav_to_file(self, wav_data, output_path): + write(output_path, rate=self.args.sample_rate, data=wav_data) + return + + def inference_from_path(self, target_wav): + self.args.target = target_wav + src_mel, _ = get_spectrograms(self.args.source) + tar_mel, _ = get_spectrograms(self.args.target) + src_mel = torch.from_numpy(self.normalize(src_mel)).cuda() + tar_mel = torch.from_numpy(self.normalize(tar_mel)).cuda() + conv_wav, conv_mel, spk_emb = self.inference_one_utterance(src_mel, tar_mel) + # self.write_wav_to_file(conv_wav, self.args.output) + return spk_emb + + def inference_from_path_v1(self, target_wav): + tar_mel, _ = get_spectrograms(target_wav) + tar_mel = torch.from_numpy(self.normalize(tar_mel)).cuda() + with torch.no_grad(): + spk_emb = self.get_emb(tar_mel) + return spk_emb + + +def infer_load(): + parser = ArgumentParser() + parser.add_argument('-attr', '-a', help='attr file path') + parser.add_argument('-config', '-c', help='config file path') + parser.add_argument('-model', '-m', help='model path') + parser.add_argument('-source', '-s', help='source wav path') + parser.add_argument('-target', '-t', help='target wav path') + parser.add_argument('-output', '-o', help='output wav path') + parser.add_argument('-sample_rate', '-sr', help='sample rate', default=24000, type=int) + args = parser.parse_args() + + base_dir = os.path.dirname(__file__) + args.attr = os.path.join(base_dir, 'premodes/attr.pkl') + args.config = os.path.join(base_dir, 'config.yaml') + args.model = os.path.join(base_dir, 'premodes/vctk_model.ckpt') + + # load config file + print('args.config: ', args.config) + with open(args.config) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + inferencer = Inferencer(config=config, args=args) + + return inferencer + + +def infer_main(inferencer, target_wav_name): + spk_emb = inferencer.inference_from_path_v1(target_wav_name).detach().cpu().numpy().squeeze() + return spk_emb + + +def l2_norm(s1, s2): + norm = np.sum(s1 * s2, -1) + return norm + + +def cos_distance(s1, s2, eps=1e-8): + s1_s2_norm = l2_norm(s1, s2) + s2_s2_norm = l2_norm(s2, s2) + s1_s1_norm = l2_norm(s1, s1) + print('s1_s1_norm: ', s1_s1_norm) + print('s1_s2_norm: ', s1_s2_norm) + print('s2_s2_norm: ', s2_s2_norm) + loss = s1_s2_norm / (np.sqrt(s2_s2_norm * s1_s1_norm) + eps) + + return loss + + +def similarity_pro(inferencer, wav_name1, wav_name2): + spk_emb1 = infer_main(inferencer, wav_name1) + spk_emb2 = infer_main(inferencer, wav_name2) + + # print('spk_emb1 size: ',spk_emb1.shape) + pro = cos_distance(spk_emb1, spk_emb2) + # print("probability size is :", pro.shape) + return pro + + +if __name__ == '__main__': + inferencer = infer_load() + wav_name1 = '/data/bingxiao.fang/voice_conversion/voice_wav/target_path/jl_24k.wav' + wav_name2 = 
'/data/bingxiao.fang/voice_conversion/voice_wav/source_path/yibo_24k.wav' + wav_name3 = '/data/bingxiao.fang/voice_conversion/voice_wav/target_path/180_3_3_24k.wav' + pro1 = similarity_pro(inferencer, wav_name1, wav_name3) + pro2 = similarity_pro(inferencer, wav_name1, wav_name2) + pro3 = similarity_pro(inferencer, wav_name3, wav_name2) + + print('###jl and suiyanzi wavcos distance :', pro1) + print('###jl and yibo wav cos distance :', pro2) + print('###yibo and sunyanzi wav cos distance :', pro3) + + pass diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/train.sh b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/train.sh new file mode 100755 index 0000000..d1ad65b --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/train.sh @@ -0,0 +1 @@ +python3 main.py -c config.yaml -d /groups/jjery2243542/data/vctk/trimmed_vctk_spectrograms/sr_24000_mel_norm -train_set train_128 -train_index_file train_samples_128.json -store_model_path /groups/jjery2243542/model/adaptive_vc/vctk_model -t vctk_model -iters 500000 -summary_step 500 diff --git a/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/utils.py b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/utils.py new file mode 100644 index 0000000..bc910e0 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/adaptive_voice_conversion/utils.py @@ -0,0 +1,35 @@ +import torch +import numpy as np +from tensorboardX import SummaryWriter +import editdistance +import torch.nn as nn +import torch.nn.init as init + +def cc(net): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + return net.to(device) + +class Logger(object): + def __init__(self, logdir='./log'): + self.writer = SummaryWriter(logdir) + + def scalar_summary(self, tag, value, step): + self.writer.add_scalar(tag, value, step) + + def scalars_summary(self, tag, dictionary, step): + self.writer.add_scalars(tag, dictionary, step) + + def text_summary(self, tag, value, step): + self.writer.add_text(tag, value, step) + + def audio_summary(self, tag, value, step, sr): + writer.add_audio(tag, value, step, sample_rate=sr) + +def infinite_iter(iterable): + it = iter(iterable) + while True: + try: + ret = next(it) + yield ret + except StopIteration: + it = iter(iterable) diff --git a/AutoCoverTool/ref/speaker_feature_extractor/readme.txt b/AutoCoverTool/ref/speaker_feature_extractor/readme.txt new file mode 100644 index 0000000..ba9bf5f --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/readme.txt @@ -0,0 +1,17 @@ +简介: 提取说话人的特征 +输入和输出: 输入一段24k单声道音频(wav), 输出一个128维度的特征向量 +性能: nvidia v100 32G显存 8c40G 机型 +输入音频: 9.34s 耗时:0.086s + + +环境安装: +pip install torch +pip install numpy +pip install pyyaml +pip install tensorboardX +pip install editdistance +pip install scipy +pip install tensorflow +pip install librosa==0.9.1 + +快速使用: python3 sf_extractor_interface.py [这是一个测试文件,建议阅读该测试文件后直接使用内部数据] diff --git a/AutoCoverTool/ref/speaker_feature_extractor/res/test.wav b/AutoCoverTool/ref/speaker_feature_extractor/res/test.wav new file mode 100644 index 0000000..e5afa4b Binary files /dev/null and b/AutoCoverTool/ref/speaker_feature_extractor/res/test.wav differ diff --git a/AutoCoverTool/ref/speaker_feature_extractor/sf_extractor_interface.py b/AutoCoverTool/ref/speaker_feature_extractor/sf_extractor_interface.py new file mode 100644 index 0000000..a23af19 --- /dev/null +++ b/AutoCoverTool/ref/speaker_feature_extractor/sf_extractor_interface.py @@ 
-0,0 +1,47 @@ +""" +声纹特征提取器 +输入一段wav音频(采样率24k,单声道) 时长不限 输出是一段长度128的编码 +""" +import os +import sys +import time + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +from adaptive_voice_conversion.spk_compare1 import infer_load, infer_main, cos_distance + + +class SFExtractorInterface: + def __init__(self): + st = time.time() + self.inst = infer_load() + print("load data sp={}".format(time.time() - st)) + + def process(self, wav_path): + return infer_main(self.inst, wav_path) + + +def test(): + sf = SFExtractorInterface() + tot = [] + for i in range(0, 10): + st = time.time() + emb = sf.process("res/test.wav") + print("sp={}".format(time.time() - st)) + tot.append(time.time() - st) + print(emb.shape, type(emb)) + print(emb) + print("avg_sp={}".format(sum(tot[1:]) / len(tot[1:]))) + # cos距离,0到1,越大越相近 + rate = cos_distance(emb, emb) + print(rate) + + +def run(): + input_wav = sys.argv[1] + sf = SFExtractorInterface() + emb = sf.process(input_wav) + print(emb) + + +if __name__ == '__main__': + test() diff --git a/AutoCoverTool/script/get_user_recordings.py b/AutoCoverTool/script/get_user_recordings.py index 3d56aed..e589dc8 100644 --- a/AutoCoverTool/script/get_user_recordings.py +++ b/AutoCoverTool/script/get_user_recordings.py @@ -1,178 +1,184 @@ """ 获取用户数据 """ import os import time import glob import json import librosa import soundfile from script.common import * def exec_cmd(cmd): r = os.popen(cmd) text = r.read() r.close() return text def get_d(audio_path): cmd = "ffprobe -v quiet -print_format json -show_format -show_streams {}".format(audio_path) data = exec_cmd(cmd) data = json.loads(data) if "format" in data.keys(): if "duration" in data['format']: return float(data["format"]["duration"]) return 0 def get_user_recordings(user_id): sql = "select id, recording_url from recording where user_id={} and created_on > {} and is_public = 1 and is_deleted = 0 and media_type in (1, 2, 3, 4, 9, 10) ".format( user_id, time.time() - 86400 * 30) res = get_shard_data_by_sql(sql, user_id) true_num = 0 for id, url in res: if download_url(url, user_id, str(id)): true_num += 1 if true_num > 15: break def download_url(url, uid, rid): url = str(url).replace("master.mp4", "origin_master.mp4") c_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0414_0514/{}".format(uid) if not os.path.exists(c_dir): os.makedirs(c_dir) c_dir = os.path.join(c_dir, "src") if not os.path.exists(c_dir): os.makedirs(c_dir) cmd = "wget {} -O {}/{}.mp4".format(url, c_dir, rid) os.system(cmd) # 转码为44k双声道音频 in_path = os.path.join(c_dir, rid + ".mp4") if os.path.exists(in_path): duration = get_d(in_path) print("duration={}".format(duration)) if duration > 30: dst_path = in_path.replace(".mp4", ".wav") - cmd = "ffmpeg -i {} -ar 44100 -ac 1 {}".format(in_path, dst_path) + cmd = "ffmpeg -i {} -ar 44100 -ac 1 -y {}".format(in_path, dst_path) print("exec={}".format(cmd)) os.system(cmd) return os.path.exists(dst_path) return False def split_to_idx(ppath, dst_path, user_id): frame_len = 32000 * 15 files = glob.glob(os.path.join(ppath, "*mp4")) mmax = 0 for file in files: try: audio, sr = librosa.load(file, sr=32000, mono=True) except Exception as ex: continue print("audio_len:={}".format(audio.shape)) for i in range(0, len(audio), frame_len): if i + frame_len > len(audio): break cur_data = audio[i:i + frame_len] out_path = os.path.join(dst_path, "{}_{}.wav".format(user_id, mmax)) print("save to {}".format(out_path)) # librosa.output.write_wav(out_path, cur_data, 32000) soundfile.write(out_path, cur_data, 32000, 
format="wav") mmax += 1 def process(): from online.beanstalk_helper import BeanstalkHelper config = {"addr": "sg-test-common-box-1:11300", "consumer": "auto_cover_tool_download_user"} bean_helper = BeanstalkHelper(config) bean = bean_helper.get_beanstalkd() bean.watch(config["consumer"]) while True: payload = bean.reserve(5) if not payload: logging.info("bean sleep...") continue in_data = json.loads(payload.body) - get_user_recordings(in_data["user_id"]) + user_id = in_data["user_id"] + try: + user_id_int = int(float(user_id)) + get_user_recordings(in_data["user_id"]) + except Exception as ex: + pass + payload.delete() def put_data(file_path): lines = [] with open(file_path, "r") as f: while True: line = f.readline().strip() if not line: break lines.append(line) from online.beanstalk_helper import BeanstalkHelper config = {"addr": "sg-test-common-box-1:11300", "consumer": "auto_cover_tool_download_user"} bean_helper = BeanstalkHelper(config) for idx, line in enumerate(lines): if idx == 0: continue user_id = line.split(",")[0] message = json.dumps({"user_id": str(user_id)}) bean_helper.put_payload_to_beanstalk(config["consumer"], message) def copy_data(): base_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0414_0514" dst_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/0414_0514_finish" # 只要10首干声以及以上的 dirs = glob.glob(os.path.join(base_dir, "*")) for cur_dir in dirs: cur_name = cur_dir.split("/")[-1] cur_mp4_files = glob.glob(os.path.join(cur_dir, "src/*wav")) if len(cur_mp4_files) > 10: print("mv {} {}".format(cur_dir, os.path.join(dst_dir, cur_name))) if __name__ == '__main__': process() # put_data("res/0414_0514.csv") # arr = [ # "5348024335101054", # "4222124657245641", # "5629499489117674", # "12384898975368914", # "5629499489839033", # "5348024336648185", # "5910973794961321", # "3635518643", # "844424937670811", # "4785074600577375", # "6755399442719465", # "4785074603156924", # "11540474053041727", # "6473924129711210", # "7036874421386111", # "7599824376482810", # "6755399447475416", # "8444249306118343", # "3377699721107378", # "12947848931397021", # "7599824374449011", # "3096224748076687", # "12103424006572822", # "1125899914308640", # "12666373952417962", # "281474982845813", # "11821949029679778", # "12947848937379499", # "12947848936090348", # "3096224747262571", # "2814749767432467", # "5066549357604730", # "3096224751151928" # ] # for uuid in arr: # get_user_recordings(uuid) # print("finish =={} ".format(uuid)) # copy_data() diff --git a/AutoCoverTool/script/multi_trainer.py b/AutoCoverTool/script/multi_trainer.py new file mode 100644 index 0000000..fa4b194 --- /dev/null +++ b/AutoCoverTool/script/multi_trainer.py @@ -0,0 +1,87 @@ +""" +分布式训练 +""" +import os +import json +import glob + + +def put_data(): + dir_list = glob.glob("/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/*") + + already_user_ids = [ + "10133099162839896", + "10133099162997509", + "10133099163727028", + "10133099163890661", + "10133099163991355", + "10133099164311744", + "10133099164313669", + "10133099165386135", + "10133099166041782", + "10133099166050735", + "10133099166238022", + "10133099166605472", + "10133099166892845", + "10133099166898301", + "10133099167125366", + "10133099167394822", + "10133099167940583", + "10133099168376799", + "10133099168924385", + "8725724286358130", + "finish" + ] + + from online.beanstalk_helper import BeanstalkHelper + config = {"addr": "sg-test-common-box-1:11300", "consumer": "auto_user_svc_trainer"} + bean_helper = 
BeanstalkHelper(config) + for idx, line in enumerate(dir_list): + cur_dir = str(line.split("/")[-1]) + if cur_dir not in already_user_ids: + message = json.dumps({"user_id": str(cur_dir)}) + bean_helper.put_payload_to_beanstalk(config["consumer"], message, ttr=2 * 86400) + print(len("tot_num={}".format(len(dir_list)))) + + +def process(): + from online.beanstalk_helper import BeanstalkHelper + bb_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/multi_users/" + config = {"addr": "sg-test-common-box-1:11300", "consumer": "auto_user_svc_trainer"} + bean_helper = BeanstalkHelper(config) + bean = bean_helper.get_beanstalkd() + bean.watch(config["consumer"]) + while True: + payload = bean.reserve(5) + if not payload: + print("bean sleep...") + continue + in_data = json.loads(payload.body) + user_id = in_data["user_id"] + try: + user_id_int = int(float(user_id)) + g_2000_path = os.path.join(bb_dir, os.path.join(str(user_id_int), "logs/32k/G_2000.pth")) + if os.path.exists(g_2000_path): + print("log, {} already exists!....".format(user_id_int)) + payload.delete() + continue + + src_dir = os.path.join(bb_dir, os.path.join(str(user_id_int), "src")) + if not os.path.exists(src_dir): + print("log, {} src not exists!....".format(user_id_int)) + payload.delete() + continue + + cmd = "bash script/train.sh {}".format(user_id_int) + print(cmd) + os.system(cmd) + print("log, train {} ok....".format(user_id_int)) + + except Exception as ex: + pass + + payload.delete() + + +if __name__ == '__main__': + process() diff --git a/AutoCoverTool/svc_inference/config.json b/AutoCoverTool/svc_inference/config.json new file mode 100644 index 0000000..8399ea3 --- /dev/null +++ b/AutoCoverTool/svc_inference/config.json @@ -0,0 +1,90 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 1000, + "seed": 1234, + "epochs": 1000, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-09, + "batch_size": 12, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 17920, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 384, + "port": "8002" + }, + "data": { + "training_files": "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/dlj_v1/filelists/train.txt", + "validation_files": "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/dlj_v1/filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 32000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 10, + 8, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 256, + "n_speakers": 2 + }, + "spk": { + "speaker0": 0 + } +} \ No newline at end of file diff --git a/AutoCoverTool/svc_inference/svc_inference_one.py b/AutoCoverTool/svc_inference/svc_inference_one.py new file mode 100644 index 0000000..1bab8e0 --- /dev/null +++ b/AutoCoverTool/svc_inference/svc_inference_one.py @@ -0,0 +1,187 @@ +""" +SVC推理逻辑 +""" +import os +import shutil +from ref.so_vits_svc.inference_main import * +from 
ref.speaker_feature_extractor.sf_extractor_interface import SFExtractorInterface + +gs_draw_volume_exe = "/data/gpu_env_common/bin/draw_volume" +gs_simple_mixer_path = "/data/gpu_env_common/bin/simple_mixer" + +gs_svci_success = 0 +gs_svci_data_params_check_model_path = 1 +gs_svci_data_params_check_vocal_path = 2 +gs_svci_data_params_check_acc_path = 3 +gs_svci_data_params_check_video_path = 4 +gs_svci_data_prepare_transcode_media = 5 +gs_svci_data_inference = 6 +gs_svci_svc_trans_442 = 7 +gs_svci_svc_volume = 8 +gs_svci_svc_mix = 9 +gs_svci_svc_mix_gen = 10 +gs_svci_svc_mix_audio_video = 11 + + +class SVCInferenceOne: + def __init__(self): + self.vocal_32_wav_path = None + self.vocal_wav_path = None + self.acc_wav_path = None + self.config = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json") + self.spk_emb_inst = SFExtractorInterface() + + def mix(self, work_dir, svc_file, vocal_file, acc_file, mix_path): + """ + :param work_dir: + :param svc_file: + :param vocal_file: + :param acc_file: + :param mix_path: + :return: + """ + cache_dir = os.path.join(work_dir, "cache") + if os.path.exists(cache_dir): + shutil.rmtree(cache_dir) + os.makedirs(cache_dir) + + # svc转码到442 + svc_442_file = os.path.join(cache_dir, "442.wav") + st = time.time() + cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(svc_file, svc_442_file) + os.system(cmd) + if not os.path.exists(svc_442_file): + return gs_svci_svc_trans_442 + logging.info("transcode,{},sp={}".format(svc_file, time.time() - st)) + + # 合并转码后再做一次拉伸,保证响度 + st = time.time() + volume_path = os.path.join(cache_dir, "volume.wav") + cmd = "{} {} {} {}".format(gs_draw_volume_exe, svc_442_file, vocal_file, volume_path) + os.system(cmd) + if not os.path.exists(volume_path): + print("{} ERROR draw volume".format(volume_path)) + return gs_svci_svc_volume + logging.info("draw_volume2,{},sp={}".format(svc_file, time.time() - st)) + + # 混合 + st = time.time() + mix_wav_path = os.path.join(cache_dir, "mix.wav") + cmd = "{} {} {} {}".format(gs_simple_mixer_path, volume_path, acc_file, mix_wav_path) + os.system(cmd) + if not os.path.exists(mix_wav_path): + return gs_svci_svc_mix + logging.info("mixer,{},sp={}".format(svc_file, time.time() - st)) + + # 编码为m4a + st = time.time() + cmd = "ffmpeg -i {} -ab 128k -y {} -loglevel fatal".format(mix_wav_path, mix_path) + print(cmd) + os.system(cmd) + if not os.path.exists(mix_path): + return gs_svci_svc_mix + logging.info("encode,{},sp={}".format(svc_file, time.time() - st)) + return gs_svci_success + + def params_check(self, model_path, vocal_path, acc_path, video_path): + if not os.path.exists(model_path): + print("model_path={} is null".format(model_path)) + return gs_svci_data_params_check_model_path + if not os.path.exists(vocal_path): + print("vocal_path={} is null".format(vocal_path)) + return gs_svci_data_params_check_vocal_path + if not os.path.exists(acc_path): + print("acc_path={} is null".format(acc_path)) + return gs_svci_data_params_check_acc_path + if not os.path.exists(video_path): + print("video_path={} is null".format(video_path)) + return gs_svci_data_params_check_video_path + return gs_svci_success + + def data_prepare(self, work_dir, vocal_path, acc_path): + self.vocal_32_wav_path = os.path.join(work_dir, "vocal_32.wav") + self.vocal_wav_path = os.path.join(work_dir, "vocal.wav") + self.acc_wav_path = os.path.join(work_dir, "acc.wav") + cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(vocal_path, self.vocal_wav_path) + os.system(cmd) + + cmd = "ffmpeg -i {} -ar 32000 -ac 
1 -y {}".format(vocal_path, self.vocal_32_wav_path) + os.system(cmd) + cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(acc_path, self.acc_wav_path) + os.system(cmd) + return os.path.exists(self.vocal_32_wav_path) and os.path.exists(self.acc_wav_path) + + def process_logic(self, work_dir, model_path, vocal_path, acc_path, video_path, out_path): + # 1. 先转码人声和伴奏 + st = time.time() + if not self.data_prepare(work_dir, vocal_path, acc_path): + print("transcode vocal={} or acc={} err!\n".format(vocal_path, acc_path)) + return gs_svci_data_prepare_transcode_media + print("transcode vocal and acc sp={}".format(time.time() - st)) + + # 2. 进行推理 + # raw_audio_path, dst_path + st = time.time() + svc_file = os.path.join(work_dir, "trans_vocal.wav") + # try: + inf(model_path, self.config, self.vocal_32_wav_path, svc_file, 'prod') + # except Exception as ex: + # print(ex) + if not os.path.exists(svc_file): + print("inference err vocal_path={}, model_path={}".format(vocal_path, model_path)) + return gs_svci_data_inference, [] + print("inf sp={}".format(time.time() - st)) + + # 3. 生成作品 + st = time.time() + mix_tmp_path = os.path.join(work_dir, "mix.wav") + err = self.mix(work_dir, svc_file, self.vocal_wav_path, self.acc_wav_path, mix_tmp_path) + if err != gs_svci_success: + return err, [] + if not os.path.exists(mix_tmp_path): + return gs_svci_svc_mix_gen, [] + print("mix sp={}".format(time.time() - st)) + + st = time.time() + # 4. 音频编码,并且和视频合并 + cmd = "ffmpeg -i {} -i {} -acodec aac -strict -2 -b:a 128k -shortest -af apad -y {}".format(video_path, + mix_tmp_path, out_path) + os.system(cmd) + if not os.path.exists(out_path): + print("mix audio_video err={}".format(video_path, mix_tmp_path)) + return gs_svci_svc_mix_audio_video, [] + print("mix audio and video sp={}".format(time.time() - st)) + + # 5. 提取emb + st = time.time() + emb = self.spk_emb_inst.process(svc_file) + print("get emb sp={}".format(time.time() - st)) + return gs_svci_success, emb + + def process(self, work_dir, model_path, vocal_path, acc_path, video_path, out_path): + err = self.params_check(model_path, vocal_path, acc_path, video_path) + if err != gs_svci_success: + return err, [] + + if os.path.exists(work_dir): + shutil.rmtree(work_dir) + os.makedirs(work_dir) + st = time.time() + err, emb = self.process_logic(work_dir, model_path, vocal_path, acc_path, video_path, out_path) + print("process_logic sp={}".format(time.time() - st)) + shutil.rmtree(work_dir) + return err, emb + + +if __name__ == '__main__': + svc_inst = SVCInferenceOne() + b_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test_svc_inference_one/" + w_dir = os.path.join(b_dir, "rg_input") + in_m4a = os.path.join(b_dir, "rg_input.m4a") + in_acc_m4a = os.path.join(b_dir, "acc.m4a") + in_video = os.path.join(b_dir, "rg.mp4") + out_video = os.path.join(b_dir, "rg_input_out.mp4") + m_path = "/data/rsync/jianli.yang/AutoCoverTool/data/train_users/jianli/logs/32k/G_2000.pth" + err, emb = svc_inst.process(w_dir, m_path, in_m4a, in_acc_m4a, in_video, out_video) + print(err) + print(emb)