diff --git a/AIMeiSheng/RawNet3/infererence_fang_meisheng.py b/AIMeiSheng/RawNet3/infererence_fang_meisheng.py index 5612582..fd9cb53 100644 --- a/AIMeiSheng/RawNet3/infererence_fang_meisheng.py +++ b/AIMeiSheng/RawNet3/infererence_fang_meisheng.py @@ -1,270 +1,272 @@ -import argparse -import itertools import os import sys +sys.path.append(os.path.dirname(__file__)) + +import argparse +import itertools from typing import Dict import numpy as np import soundfile as sf import torch import torch.nn.functional as F from tqdm import tqdm from models.RawNet3 import RawNet3 from models.RawNetBasicBlock import Bottle2neck from utils import tuneThresholdfromScore, ComputeErrorRates, ComputeMinDcf #model_directory = '/data/bingxiao.fang/speaker_identify/RawNet/python/RawNet3' #sys.path.append(os.path.abspath(model_directory)) def get_embed_model(model_path): model = RawNet3( Bottle2neck, model_scale=8, context=True, summed=True, encoder_type="ECA", nOut=256, out_bn=False, sinc_stride=10, log_sinc=True, norm_sinc="mean", grad_mult=1, ) model.load_state_dict( torch.load( model_path, # "/data/bingxiao.fang/speaker_identify/RawNet/python/RawNet3/models/weights/model.pt", map_location=lambda storage, loc: storage, )["model"] ) model.eval() return model def main(args: Dict, model=None) -> None: if model == None: model = RawNet3( Bottle2neck, model_scale=8, context=True, summed=True, encoder_type="ECA", nOut=256, out_bn=False, sinc_stride=10, log_sinc=True, norm_sinc="mean", grad_mult=1, ) model.load_state_dict( torch.load( "./models/weights/model.pt", map_location=lambda storage, loc: storage, )["model"] ) model.eval() # gpu = False gpu = True if torch.cuda.is_available() else False #print("RawNet3 initialised & weights loaded!") if torch.cuda.is_available(): #print("Cuda available, conducting inference on GPU") model = model.to("cuda") gpu = True if args.inference_utterance: output = extract_speaker_embd( model, fn=args.input, n_samples=48000, n_segments=args.n_segments, gpu=gpu, ).mean(0) #print("embead shape:", output.size()) np.save(args.out_dir, output.detach().cpu().numpy()) return if args.vox1_o_benchmark: with open("../../trials/cleaned_test_list.txt", "r") as f: trials = f.readlines() ## Get a list of unique file names files = list(itertools.chain(*[x.strip().split()[-2:] for x in trials])) setfiles = list(set(files)) setfiles.sort() embd_dic = {} for f in tqdm(setfiles): embd_dic[f] = extract_speaker_embd( model, os.path.join(args.DB_dir, f), n_samples=64000, gpu=gpu ) labels, scores = [], [] for line in trials: data = line.split() ref_feat = F.normalize(embd_dic[data[1]], p=2, dim=1) com_feat = F.normalize(embd_dic[data[2]], p=2, dim=1) if gpu: ref_feat = ref_feat.cuda() com_feat = com_feat.cuda() dist = ( torch.cdist( ref_feat.reshape((args.n_segments, -1)), com_feat.reshape((args.n_segments, -1)), ) .detach() .cpu() .numpy() ) score = -1.0 * np.mean(dist) labels.append(int(data[0])) scores.append(score) result = tuneThresholdfromScore(scores, labels, [1, 0.1]) fnrs, fprs, thresholds = ComputeErrorRates(scores, labels) p_target, c_miss, c_fa = 0.05, 1, 1 mindcf, _ = ComputeMinDcf( fnrs, fprs, thresholds, p_target, c_miss, c_fa ) print( "Vox1-O benchmark Finished. 
EER: %2.4f, minDCF:%.5f" % (result[1], mindcf) ) import librosa def extract_speaker_embd( model, fn: str, n_samples: int, n_segments: int = 10, gpu: bool = False ) -> np.ndarray: #audio, sample_rate = sf.read(fn) audio, sample_rate = librosa.load(fn,sr=16000) ##fang add if len(audio.shape) > 1: raise ValueError( f"RawNet3 supports mono input only. Input data has a shape of {audio.shape}." ) if sample_rate != 16000: raise ValueError( f"RawNet3 supports 16k sampling rate only. Input data's sampling rate is {sample_rate}." ) if ( len(audio) < n_samples ): # RawNet3 was trained using utterances of 3 seconds shortage = n_samples - len(audio) + 1 audio = np.pad(audio, (0, shortage), "wrap") audios = [] startframe = np.linspace(0, len(audio) - n_samples, num=n_segments) for asf in startframe: audios.append(audio[int(asf) : int(asf) + n_samples]) audios = torch.from_numpy(np.stack(audios, axis=0).astype(np.float32)) if gpu: audios = audios.to("cuda") with torch.no_grad(): output = model(audios) return output def get_embed(target_wav, embed_npy, model=None): parser = argparse.ArgumentParser(description="RawNet3 inference") parser.add_argument( "--inference_utterance", default=True, action="store_true" ) parser.add_argument( "--input", type=str, default="", help="Input file to extract embedding. Required when 'inference_utterance' is True", ) parser.add_argument( "--vox1_o_benchmark", default=False, action="store_true" ) parser.add_argument( "--DB_dir", type=str, default="", help="Directory for VoxCeleb1. Required when 'vox1_o_benchmark' is True", ) parser.add_argument("--out_dir", type=str, default="./out.npy") parser.add_argument( "--n_segments", type=int, default=10, help="number of segments to make using each utterance", ) args = parser.parse_args() args.input = target_wav args.out_dir = embed_npy assert args.inference_utterance or args.vox1_o_benchmark if args.inference_utterance: assert args.input != "" if args.vox1_o_benchmark: assert args.DB_dir != "" #sys.exit(main(args,model)) main(args,model) if __name__ == "__main__": parser = argparse.ArgumentParser(description="RawNet3 inference") parser.add_argument( "--inference_utterance", default=False, action="store_true" ) parser.add_argument( "--input", type=str, default="", help="Input file to extract embedding. Required when 'inference_utterance' is True", ) parser.add_argument( "--vox1_o_benchmark", default=False, action="store_true" ) parser.add_argument( "--DB_dir", type=str, default="", help="Directory for VoxCeleb1. 
Required when 'vox1_o_benchmark' is True", ) parser.add_argument("--out_dir", type=str, default="./out.npy") parser.add_argument( "--n_segments", type=int, default=10, help="number of segments to make using each utterance", ) args = parser.parse_args() assert args.inference_utterance or args.vox1_o_benchmark if args.inference_utterance: assert args.input != "" if args.vox1_o_benchmark: assert args.DB_dir != "" sys.exit(main(args)) diff --git a/AIMeiSheng/docker_demo/Dockerfile b/AIMeiSheng/docker_demo/Dockerfile index 84159ab..3ba096e 100644 --- a/AIMeiSheng/docker_demo/Dockerfile +++ b/AIMeiSheng/docker_demo/Dockerfile @@ -1,29 +1,25 @@ -# 指定基础映像 -FROM python:3.8.12 - -# 设置软件包源为中科大镜像源 -#RUN echo "deb https://mirrors.ustc.edu.cn/debian/ buster main" > /etc/apt/sources.list && \ -# echo "deb-src https://mirrors.ustc.edu.cn/debian/ buster main" >> /etc/apt/sources.list -# 更新软件包源 -#RUN apt-get update - -# 安装 libgl1-mesa-glx 软件包 -#RUN apt-get install -y libgl1-mesa-glx - -# 设置工作目录 -WORKDIR /data/bingxiao.fang/docker +# 系统版本 CUDA Version 11.8.0 +# NAME="CentOS Linux" VERSION="7 (Core)" +# FROM starmaker.tencentcloudcr.com/starmaker/av/av:1.1 -##复制工程文件放到容器中 -COPY . /data/bingxiao.fang/docker +# 基础镜像, python3.9,cuda118,centos7,外加ffmpeg -# 安装依赖项 -RUN pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple/ --default-timeout=60 --no-cache-dir -r requirements.txt - -# 复制 Python 文件和依赖项清单 +#FROM starmaker.tencentcloudcr.com/starmaker/av/av_base:1.0 +FROM av_base_test:1.0 - - -EXPOSE 6768 +RUN source /etc/profile && sed -i 's|mirrorlist=|#mirrorlist=|g' /etc/yum.repos.d/CentOS-Base.repo && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Base.repo && yum clean all && yum install -y unzip && yum install -y libsndfile && yum install -y libsamplerate libsamplerate-devel +RUN source /etc/profile && pip3 install librosa && pip3 install gradio && pip3 install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +RUN source /etc/profile && pip3 install urllib3==1.26.15 && pip3 install coscmd && coscmd config -a AKIDoQmshFWXGitnQmrfCTYNwEExPaU6RVHm -s F9n9E2ZonWy93f04qMaYFfogHadPt62h -b log-sg-1256122840 -r ap-singapore +RUN source /etc/profile && pip3 install asteroid-filterbanks +RUN source /etc/profile && pip3 install praat-parselmouth==0.4.3 +RUN source /etc/profile && pip3 install pyworld +RUN source /etc/profile && pip3 install faiss-cpu +RUN source /etc/profile && pip3 install torchcrepe +RUN source /etc/profile && pip3 install thop +RUN source /etc/profile && pip3 install ffmpeg-python +RUN source /etc/profile && pip3 install fairseq +RUN source /etc/profile && pip3 install redis -# 指定容器启动命令 -CMD ["python", "./main.py"] +WORKDIR /data/code + +CMD ["/bin/bash", "-c", "source /etc/profile; export PYTHONPATH=/data/code; cd /data/code/AIMeiSheng/docker_demo; python3 offline_server.py"] \ No newline at end of file diff --git a/AIMeiSheng/docker_demo/common.py b/AIMeiSheng/docker_demo/common.py index 6a31932..3eff425 100644 --- a/AIMeiSheng/docker_demo/common.py +++ b/AIMeiSheng/docker_demo/common.py @@ -1,52 +1,61 @@ import os import time import logging import urllib, urllib.request +gs_tmp_dir = "/tmp/ai_meisheng_tmp" +gs_model_dir = "/tmp/ai_meisheng_models" +gs_resource_cache_dir = "/tmp/ai_meisheng_resource_cache" +gs_svc_model_path = os.path.join(gs_model_dir, + "weights/xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth") +gs_embed_model_path = os.path.join(gs_model_dir, 
"RawNet3/models/weights/model.pt") +gs_hubert_model_path = os.path.join(gs_model_dir, "hubert.pt") +gs_rmvpe_model_path = os.path.join(gs_model_dir, "rmvpe.pt") + def download2disk(url, dst_path): st = time.time() urllib.request.urlretrieve(url, dst_path) print(f"download {url} -> {dst_path} sp = {time.time() - st}") return os.path.exists(dst_path) def exec_cmd(cmd): # gs_logger.info(cmd) print(cmd) ret = os.system(cmd) if ret != 0: return False return True def exec_cmd_and_result(cmd): r = os.popen(cmd) text = r.read() r.close() return text def upload_file2cos(key, file_path, region='ap-singapore', bucket_name='av-audit-sync-sg-1256122840'): """ 将文件上传到cos :param key: 桶上的具体地址 :param file_path: 本地文件地址 :param region: 区域 :param bucket_name: 桶地址 :return: """ gs_coscmd = "coscmd" gs_coscmd_conf = "~/.cos.conf" cmd = "{} -c {} -r {} -b {} upload {} {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, file_path, key) if exec_cmd(cmd): cmd = "{} -c {} -r {} -b {} info {}".format(gs_coscmd, gs_coscmd_conf, region, bucket_name, key) \ + "| grep Content-Length |awk \'{print $2}\'" res_str = exec_cmd_and_result(cmd) logging.info("{},res={}".format(key, res_str)) size = float(res_str) if size > 0: return True return False return False diff --git a/AIMeiSheng/docker_demo/http_server.py b/AIMeiSheng/docker_demo/http_server.py index 23ac0ba..a943980 100644 --- a/AIMeiSheng/docker_demo/http_server.py +++ b/AIMeiSheng/docker_demo/http_server.py @@ -1,128 +1,84 @@ # -*- coding: UTF-8 -*- """ SVC处理逻辑 1. 根据跟定的vocal_url 判别男女 2. 根据男女信息选择适合的男女url 3. 模型推理 """ import gc import os -import shutil import sys +import json import time +import socket + import logging import hashlib -import numpy as np -import multiprocessing as mp -from multiprocessing import Pool from flask import Flask, jsonify, request, abort -from common import download2disk, exec_cmd, upload_file2cos -from svc_online import GSWorkerAttr, SVCOnline, volume_adjustment -# 全局设置 -import socket +from redis_helper import RedisHelper +from offline_server import (gs_server_redis_conf, gs_redis_conf, check_input, gs_err_code_pending, + gs_err_code_params, gs_err_code_too_many_connections) +sys.path.append(os.path.dirname(__file__)) +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) + +# 全局设置 hostname = socket.gethostname() -log_file_name = f"av_svc_{hostname}.log" +log_file_name = f"/tmp/av_meisheng_http_{hostname}.log" logging.basicConfig(filename=log_file_name, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S', level=logging.INFO) -# errcode -gs_err_code_success = 0 -gs_err_code_download_vocal = 100 -gs_err_code_download_svc_url = 101 -gs_err_code_svc_process = 102 -gs_err_code_transcode = 103 -gs_err_code_volume_adjust = 104 -gs_err_code_upload = 105 - -sys.path.append(os.path.dirname(__file__)) -sys.path.append(os.path.join(os.path.dirname(__file__), "../")) - app = Flask(__name__) -def download_data(worker_attr): - vocal_path = os.path.join(worker_attr.tmp_dir, worker_attr.distinct_id) - if os.path.exists(vocal_path): - os.remove(vocal_path) - - st = time.time() - if not download2disk(worker_attr.vocal_url, worker_attr.vocal_path): - return gs_err_code_download_vocal - logging.info(f"download vocal_url={worker_attr.vocal_url} sp = {time.time() - st}") - - # download svc_source_url - if not os.path.exists(worker_attr.female_svc_source_path): - st = time.time() - if not download2disk(worker_attr.female_svc_source_url, worker_attr.female_svc_source_path): - return gs_err_code_download_svc_url - 
logging.info(f"download female_url={worker_attr.female_svc_source_url} sp = {time.time() - st}") - - # download svc_source_url - if not os.path.exists(worker_attr.male_svc_source_path): - st = time.time() - if not download2disk(worker_attr.male_svc_source_url, worker_attr.male_svc_source_path): - return gs_err_code_download_svc_url - logging.info(f"download male_url={worker_attr.male_svc_source_url} sp = {time.time() - st}") - return gs_err_code_success - - -def transcode(wav_path, dst_path): - st = time.time() - cmd = f"ffmpeg -i {wav_path} -ar 44100 -ac 2 -b:a 64k -y {dst_path} -loglevel fatal" - exec_cmd(cmd) - logging.info(f"transcode cmd={cmd}, sp = {time.time() - st}") - return os.path.exists(dst_path) +class HttpServer: + def __init__(self, redis_conf, server_conf): + self.redis_helper = RedisHelper(redis_conf) + self.server_conf = server_conf + def process(self, in_data): + msg = { + "status": gs_err_code_params, + "schedule": 100, + "gender": "unknown", + "target_song_url": "", + } -gs_svc_online = None + if not check_input(in_data): + return msg + if self.redis_helper.llen(self.server_conf["producer"]) > 10: + msg["status"] = gs_err_code_too_many_connections + return msg -def process_one(input_data): - logging.info(f"start input={input_data} start prepare data ...") - worker_attr = GSWorkerAttr(input_data) - err = download_data(worker_attr) - if err != gs_err_code_success: - return err, None + distinct_id = hashlib.md5(in_data["record_song_url"].encode()).hexdigest() + distinct_key = self.server_conf["ai_meisheng_key_prefix"] + distinct_id + if not self.redis_helper.exists(distinct_key): + msg["status"] = gs_err_code_pending + self.redis_helper.set(distinct_key, json.dumps(msg)) + self.redis_helper.lpush(self.server_conf["producer"], json.dumps(in_data)) - # process audio - global gs_svc_online - if gs_svc_online is None: - gs_svc_online = SVCOnline() - gs_svc_online.process(worker_attr) - if not os.path.exists(worker_attr.target_wav_path): - return gs_err_code_svc_process, None + self.redis_helper.expire(distinct_key, 15) + msg = self.redis_helper.get(distinct_key) + return json.loads(msg) - # 音量拉伸到指定响度 - volume_adjustment(worker_attr.target_wav_path, worker_attr.target_loudness, worker_attr.target_wav_ad_path) - if not os.path.exists(worker_attr.target_wav_ad_path): - return gs_err_code_volume_adjust, None - # transcode - if not transcode(worker_attr.target_wav_path, worker_attr.target_path): - return gs_err_code_transcode, None - - # upload - st = time.time() - if upload_file2cos(worker_attr.target_url, worker_attr.target_path): - return gs_err_code_upload, None - logging.info(f"audio_url={worker_attr.vocal_url} upload {worker_attr.target_url} sp = {time.time() - st}") - return gs_err_code_success, worker_attr.target_path +gs_http_server = HttpServer(gs_redis_conf, gs_server_redis_conf) @app.route("/ai_meisheng", methods=["POST"]) -def get_song_res(): +def ai_meisheng(): data = request.json st = time.time() logging.info(f"ai_meisheng:in:{data}") - ret, url = process_one(data) - all_ret_msg = jsonify({"out_url": url, "ret": ret}) - logging.info(f"ai_meisheng:out:{data}-{all_ret_msg}, sp={time.time() - st}") - return all_ret_msg + msg = gs_http_server.process(data) + json_msg = jsonify(msg) + logging.info(f"ai_meisheng:out:{data}-{json_msg}, sp={time.time() - st}") + return json_msg if __name__ == "__main__": app.run(host='0.0.0.0', port=5000, threaded=False) diff --git a/AIMeiSheng/docker_demo/offline_server.py b/AIMeiSheng/docker_demo/offline_server.py new file mode 100644 
index 0000000..134f7f9
--- /dev/null
+++ b/AIMeiSheng/docker_demo/offline_server.py
@@ -0,0 +1,189 @@
+# -*- coding: UTF-8 -*-
+"""
+Offline processing: interact with redis, fetch jobs from redis and write the results back to redis
+"""
+import os
+import sys
+import time
+import json
+import socket
+import hashlib
+import logging
+
+from redis_helper import RedisHelper
+from common import download2disk, exec_cmd, upload_file2cos
+from svc_online import GSWorkerAttr, SVCOnline, volume_adjustment
+
+hostname = socket.gethostname()
+log_file_name = f"/tmp/av_meisheng_{hostname}.log"
+logging.basicConfig(filename=log_file_name, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S',
+                    level=logging.INFO)
+
+# errcode
+gs_err_code_success = 0
+gs_err_code_download_vocal = 100
+gs_err_code_download_svc_url = 101
+gs_err_code_svc_process = 102
+gs_err_code_transcode = 103
+gs_err_code_volume_adjust = 104
+gs_err_code_upload = 105
+gs_err_code_params = 106
+gs_err_code_pending = 107
+gs_err_code_too_many_connections = 429
+
+sys.path.append(os.path.dirname(__file__))
+sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
+
+gs_redis_conf = {
+    "host": "av-credis.starmaker.co",
+    "port": 6379,
+    "pwd": "lKoWEhz%jxTO",
+}
+
+gs_server_redis_conf = {
+    "producer": "ai_meisheng_producer",  # input queue
+    "ai_meisheng_key_prefix": "ai_meisheng_key_",  # stores the per-job result/status
+}
+
+
+def download_data(worker_attr):
+    vocal_path = os.path.join(worker_attr.tmp_dir, worker_attr.distinct_id)
+    if os.path.exists(vocal_path):
+        os.remove(vocal_path)
+
+    st = time.time()
+    if not download2disk(worker_attr.vocal_url, worker_attr.vocal_path):
+        return gs_err_code_download_vocal
+    logging.info(f"download vocal_url={worker_attr.vocal_url} sp = {time.time() - st}")
+
+    # download svc_source_url
+    if not os.path.exists(worker_attr.female_svc_source_path):
+        st = time.time()
+        if not download2disk(worker_attr.female_svc_source_url, worker_attr.female_svc_source_path):
+            return gs_err_code_download_svc_url
+        logging.info(f"download female_url={worker_attr.female_svc_source_url} sp = {time.time() - st}")
+
+    # download svc_source_url
+    if not os.path.exists(worker_attr.male_svc_source_path):
+        st = time.time()
+        if not download2disk(worker_attr.male_svc_source_url, worker_attr.male_svc_source_path):
+            return gs_err_code_download_svc_url
+        logging.info(f"download male_url={worker_attr.male_svc_source_url} sp = {time.time() - st}")
+    return gs_err_code_success
+
+
+def transcode(wav_path, dst_path):
+    st = time.time()
+    cmd = f"ffmpeg -i {wav_path} -ar 44100 -ac 2 -b:a 64k -y {dst_path} -loglevel fatal"
+    exec_cmd(cmd)
+    logging.info(f"transcode cmd={cmd}, sp = {time.time() - st}")
+    return os.path.exists(dst_path)
+
+
+def check_input(input_data):
+    key_list = ["record_song_url", "target_url", "start", "end", "vocal_loudness", "female_recording_url",
+                "male_recording_url"]
+    for key in key_list:
+        if key not in input_data.keys():
+            return False
+    return True
+
+
+class OfflineServer:
+    def __init__(self, redis_conf, server_conf, update_redis=False):
+        self.redis_helper = RedisHelper(redis_conf)
+        self.svc_online = SVCOnline()
+        self.server_conf = server_conf
+        self.distinct_key = server_conf["ai_meisheng_key_prefix"]
+        self.update_redis = update_redis
+
+    def exists(self):
+        return self.redis_helper.exists(self.distinct_key)
+
+    def update_result(self, errcode, schedule, gender, target_song_url):
+        msg = {
+            "status": errcode,
+            "schedule": schedule,
+            "gender": gender,
+            "target_song_url": target_song_url,
+        }
+        # keep the result for 10 min (60 * 10 s)
+        if self.update_redis:
+            self.redis_helper.set(self.distinct_key, json.dumps(msg))
+            self.redis_helper.expire(self.distinct_key, 60 * 10)
+
+    def process_one(self, worker_attr):
+        self.distinct_key = self.server_conf["ai_meisheng_key_prefix"] + worker_attr.distinct_id
+        logging.info(f"{worker_attr.log_info_name()}, start download ...")
+        err = download_data(worker_attr)
+        if err != gs_err_code_success:
+            self.update_result(err, 100, "unknown", worker_attr.target_url)
+            return err, None, None
+        self.update_result(err, 35, "unknown", worker_attr.target_url)
+
+        logging.info(f"{worker_attr.log_info_name()}, start process ...")
+        gender = self.svc_online.process(worker_attr)
+        if not os.path.exists(worker_attr.target_wav_path):
+            self.update_result(gs_err_code_svc_process, 100, gender, worker_attr.target_url)
+            return gs_err_code_svc_process, None, None
+        self.update_result(err, 85, gender, worker_attr.target_url)
+
+        # stretch the volume to the target loudness
+        logging.info(f"{worker_attr.log_info_name()}, start volume_adjustment ...")
+        volume_adjustment(worker_attr.target_wav_path, worker_attr.target_loudness, worker_attr.target_wav_ad_path)
+        if not os.path.exists(worker_attr.target_wav_ad_path):
+            self.update_result(gs_err_code_volume_adjust, 100, gender, worker_attr.target_url)
+            return gs_err_code_volume_adjust, None, None
+        self.update_result(err, 90, gender, worker_attr.target_url)
+
+        # transcode
+        logging.info(f"{worker_attr.log_info_name()}, start transcode ...")
+        if not transcode(worker_attr.target_wav_path, worker_attr.target_path):
+            self.update_result(gs_err_code_transcode, 100, gender, worker_attr.target_url)
+            return gs_err_code_transcode, None, None
+        self.update_result(err, 95, gender, worker_attr.target_url)
+
+        # upload
+        logging.info(f"{worker_attr.log_info_name()}, start upload_file2cos ...")
+        st = time.time()
+        # split bucket_name, region and key out of target_url, e.g.
+        # "http://starmaker-sv-1256122840.cos.na-siliconvalley.myqcloud.com/production/ai_voice/7036874317774285/xxalkdjfladjflkasdf-target.mp4",
+        bucket_name = worker_attr.target_url.split(".")[0].split("//")[-1]
+        region = worker_attr.target_url.split(".")[2]
+        key = "/".join(worker_attr.target_url.split("/")[3:])
+
+        logging.info(f"{worker_attr.log_info_name()}, start upload_file2cos {bucket_name}, {region}, {key}")
+        if not upload_file2cos(key, worker_attr.target_path, region=region, bucket_name=bucket_name):
+            self.update_result(gs_err_code_upload, 100, gender, worker_attr.target_url)
+            return gs_err_code_upload, None, None
+        self.update_result(gs_err_code_success, 100, gender, worker_attr.target_url)
+        logging.info(f"{worker_attr.log_info_name()} upload {worker_attr.target_url} sp = {time.time() - st}")
+        return gs_err_code_success, worker_attr.target_url, gender
+
+    def process(self):
+        while True:
+            data = self.redis_helper.rpop(self.server_conf["producer"])
+            if data is None:
+                time.sleep(1)
+                continue
+
+            data = json.loads(data)
+            if not check_input(data):
+                logging.error(f"input data error={data}")
+                continue
+
+            worker_attr = GSWorkerAttr(data)
+            self.distinct_key = self.server_conf["ai_meisheng_key_prefix"] + worker_attr.distinct_id
+            if not self.exists():
+                logging.warning(f"input {data}, timeout abandon ....")
+                continue
+
+            st = time.time()
+            errcode, target_path, gender = self.process_one(worker_attr)
+            self.update_result(errcode, 100, gender, target_path)
+            logging.info(f"{worker_attr.log_info_name()} finish sp = {time.time() - st}")
+
+
+if __name__ == '__main__':
+    offline_server = OfflineServer(gs_redis_conf, gs_server_redis_conf, True)
+    offline_server.process()
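process_one above derives the COS bucket, region, and object key from target_url with raw string splits. The snippet below re-checks those splits against the sample URL quoted in the code comment, with a urlparse-based equivalent for comparison; split_cos_url is a hypothetical helper, not part of this patch. Both variants assume the host is exactly <bucket>.cos.<region>.myqcloud.com:

from urllib.parse import urlparse

def split_cos_url(target_url):
    # Host has the form <bucket>.cos.<region>.myqcloud.com
    parsed = urlparse(target_url)
    bucket_name, _, region = parsed.netloc.split(".")[:3]
    key = parsed.path.lstrip("/")
    return bucket_name, region, key

url = ("http://starmaker-sv-1256122840.cos.na-siliconvalley.myqcloud.com"
       "/production/ai_voice/7036874317774285/xxalkdjfladjflkasdf-target.mp4")
assert url.split(".")[0].split("//")[-1] == "starmaker-sv-1256122840"  # bucket_name
assert url.split(".")[2] == "na-siliconvalley"                         # region
assert "/".join(url.split("/")[3:]) == "production/ai_voice/7036874317774285/xxalkdjfladjflkasdf-target.mp4"  # key
assert split_cos_url(url) == ("starmaker-sv-1256122840", "na-siliconvalley",
                              "production/ai_voice/7036874317774285/xxalkdjfladjflkasdf-target.mp4")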
diff --git a/AIMeiSheng/docker_demo/readme.txt b/AIMeiSheng/docker_demo/readme.txt
new file mode 100644
index 0000000..066ca90
--- /dev/null
+++ b/AIMeiSheng/docker_demo/readme.txt
@@ -0,0 +1,23 @@
+Overview: the AI Meisheng (beautified-singing) feature. Its core: take a 15-30s vocal clip as the timbre reference, then, given an input vocal source, convert that source into the reference timbre. For example, "Dong Feng Po" as sung by Stefanie Sun.
+Architecture: http_server.py (one instance) acts as the server side; it receives external requests and pushes them into redis, from which offline_server.py (multiple instances) picks them up.
+
+# Deployment requirements:
+1. http_server.py is deployed on sg-prod-songrefresh-gpu-7
+2. offline_server.py is deployed with docker on the super nodes, controlled by ops
+
+# http_server.py environment requirements:
+    pip install redis
+    pip install flask
+
+# offline_server.py environment requirements (docker)
+    in the docker_demo directory (for example):
+    1. docker build -f Dockerfile -t av_ai_meisheng .
+       (get av_ai_meisheng's image_id via docker images)
+    2. docker run --gpus all -it -v /data/rsync/jianli.yang/av_svc:/data/code image_id  # this starts the service
+
+# Test code: in the docker environment, running offline_server.py is enough to verify
+# http test command:
+curl http://127.0.0.1:5004/ai_meisheng -H "Content-Type: application/json" -d '{ "record_song_url": "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/vocal_test/yinse.m4a", "target_url": "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/vocal_test/out.m4a","start": 0,"end": 15000,"vocal_loudness": -14.57,"female_recording_url": "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/vocal_test/female.m4a", "male_recording_url": "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/vocal_test/male.m4a"}'
+{"gender":"male","schedule":100,"status":0,"target_song_url":"https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/vocal_test/out.m4a"}
+
+# Resource usage: about 2G of GPU memory; recommend deploying 2 offline_server.py instances per GPU machine
diff --git a/AIMeiSheng/docker_demo/redis_helper.py b/AIMeiSheng/docker_demo/redis_helper.py
new file mode 100644
index 0000000..2cac13a
--- /dev/null
+++ b/AIMeiSheng/docker_demo/redis_helper.py
@@ -0,0 +1,44 @@
+# -*- coding: UTF-8 -*-
+import time
+
+import redis
+
+
+class RedisHelper:
+    def __init__(self, redis_conf, esp_sec=60):
+        self.redis_conf = redis_conf
+        self.last_tm = time.time()
+        self.esp_sec = esp_sec
+        self.redis_client = redis.StrictRedis(host=self.redis_conf["host"],
+                                              port=int(self.redis_conf["port"]), password=self.redis_conf["pwd"])
+
+    def get_client(self):
+        if time.time() - self.last_tm > self.esp_sec:
+            self.redis_client = redis.StrictRedis(host=self.redis_conf["host"],
+                                                  port=int(self.redis_conf["port"]), password=self.redis_conf["pwd"])
+            self.last_tm = time.time()
+        return self.redis_client
+
+    def get(self, key):
+        return self.get_client().get(key)
+
+    def set(self, key, value):
+        return self.get_client().set(key, value)
+
+    def exists(self, key):
+        return self.get_client().exists(key)
+
+    def expire(self, key, expire_time_sec):
+        return self.get_client().expire(key, expire_time_sec)
+
+    def del_key(self, key):
+        return self.get_client().delete(key)
+
+    def lpush(self, key, value):
+        return self.get_client().lpush(key, value)
+
+    def rpop(self, key):
+        return self.get_client().rpop(key)
+
+    def llen(self, key):
+        return self.get_client().llen(key)
diff --git a/AIMeiSheng/docker_demo/requirements.txt b/AIMeiSheng/docker_demo/requirements.txt
deleted file mode 100644
index b68ec06..0000000
--- a/AIMeiSheng/docker_demo/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-gradio==3.42.0
-gradio_client==0.5.0
diff --git a/AIMeiSheng/docker_demo/svc_online.py
b/AIMeiSheng/docker_demo/svc_online.py index b196832..ec3cdf0 100644 --- a/AIMeiSheng/docker_demo/svc_online.py +++ b/AIMeiSheng/docker_demo/svc_online.py @@ -1,170 +1,165 @@ # -*- coding: UTF-8 -*- """ SVC的核心处理逻辑 """ import os import shutil import hashlib import time from AIMeiSheng.meisheng_svc_final import load_model, process_svc_online from AIMeiSheng.meisheng_env_preparex import meisheng_env_prepare -from AIMeiSheng.voice_classification.online.voice_class_online_fang import VoiceClass -from AIMeiSheng.RawNet3.infererence_fang_meisheng import get_embed, get_embed_model -from AIMeiSheng.myinfer_multi_spk_embed_in_dec_diff_fi_meisheng import svc_main, load_hubert, get_vc, get_rmvpe +from AIMeiSheng.voice_classification.online.voice_class_online_fang import VoiceClass, download_volume_balanced from AIMeiSheng.docker_demo.common import * -gs_resource_cache_dir = "/tmp/gs_svc_resource_cache" -gs_tmp_dir = "/tmp/gs_svc_tmp" -gs_model_dir = "/tmp/models" - if os.path.exists(gs_tmp_dir): shutil.rmtree(gs_tmp_dir) os.makedirs(gs_model_dir, exist_ok=True) +os.makedirs(gs_resource_cache_dir, exist_ok=True) # 预设参数 -gs_gender_models_url = "https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/hub/voice_classification/models.zip" -gs_svc_emb_url = "" -gs_svc_model_url = "" -gs_volume_bin_url = "https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/dataset/AIMeiSheng/ebur128_tool" +gs_gender_models_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/hub/voice_classification/models.zip" +gs_volume_bin_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/ebur128_tool" class GSWorkerAttr: def __init__(self, input_data): - vocal_url = input_data["vocal_url"] - female_svc_source_url = input_data["female_svc_url"] - male_svc_source_url = input_data["male_svc_url"] - st_tm = input_data["st_tm"] # 单位是s - ed_tm = input_data["ed_tm"] # 单位是s - - self.distinct_id = hashlib.md5(vocal_url.encode()).hexdigest()#对url进行哈希(通过哈希值建立索引,提高数据库的响应速度) - self.vocal_url = vocal_url - self.target_url = input_data["target_url"] - - ext = vocal_url.split(".")[-1] - self.vocal_path = os.path.join(gs_tmp_dir, self.distinct_id + f"_in.{ext}") - self.target_wav_path = os.path.join(gs_tmp_dir, self.distinct_id + "_out.wav") - self.target_wav_ad_path = os.path.join(gs_tmp_dir, self.distinct_id + "_out_ad.wav") - self.target_path = os.path.join(gs_tmp_dir, self.distinct_id + "_out.m4a") - - self.female_svc_source_url = female_svc_source_url - self.male_svc_source_url = male_svc_source_url - - ext = female_svc_source_url.split(".")[-1] - self.female_svc_source_path = hashlib.md5(female_svc_source_url.encode()).hexdigest() + "." + ext - ext = male_svc_source_url.split(".")[-1] - self.male_svc_source_path = hashlib.md5(male_svc_source_url.encode()).hexdigest() + "." 
+ ext - self.st_tm = st_tm - self.ed_tm = ed_tm - self.target_loudness = input_data["target_loudness"] - + # 取出输入资源 + vocal_url = input_data["record_song_url"] + target_url = input_data["target_url"] + start = input_data["start"] # 单位是ms + end = input_data["end"] # 单位是ms + vocal_loudness = input_data["vocal_loudness"] + female_recording_url = input_data["female_recording_url"] + male_recording_url = input_data["male_recording_url"] + + self.distinct_id = hashlib.md5(vocal_url.encode()).hexdigest() self.tmp_dir = os.path.join(gs_tmp_dir, self.distinct_id) if os.path.exists(self.tmp_dir): shutil.rmtree(self.tmp_dir) os.makedirs(self.tmp_dir) + self.vocal_url = vocal_url + self.target_url = target_url + + ext = vocal_url.split(".")[-1] + self.vocal_path = os.path.join(self.tmp_dir, self.distinct_id + f"_in.{ext}") + self.target_wav_path = os.path.join(self.tmp_dir, self.distinct_id + "_out.wav") + self.target_wav_ad_path = os.path.join(self.tmp_dir, self.distinct_id + "_out_ad.wav") + self.target_path = os.path.join(self.tmp_dir, self.distinct_id + "_out.m4a") + + self.female_svc_source_url = female_recording_url + self.male_svc_source_url = male_recording_url + + ext = female_recording_url.split(".")[-1] + self.female_svc_source_path = os.path.join(gs_resource_cache_dir, + hashlib.md5(female_recording_url.encode()).hexdigest() + "." + ext) + ext = male_recording_url.split(".")[-1] + self.male_svc_source_path = os.path.join(gs_resource_cache_dir, + hashlib.md5(male_recording_url.encode()).hexdigest() + "." + ext) + self.st_tm = start + self.ed_tm = end + self.target_loudness = vocal_loudness + + def log_info_name(self): + return f"d_id={self.distinct_id}, vocal_url={self.vocal_url}" + def __del__(self): - if os.path.exists(self.tmp_dir): - shutil.rmtree(self.tmp_dir) + pass + # if os.path.exists(self.tmp_dir): + # shutil.rmtree(self.tmp_dir) def init_gender_model(): """ 下载模型 :return: """ dst_model_dir = os.path.join(gs_model_dir, "voice_classification") if not os.path.exists(dst_model_dir): dst_zip_path = os.path.join(gs_model_dir, "models.zip") if not download2disk(gs_gender_models_url, dst_zip_path): logging.fatal(f"download gender_model err={gs_gender_models_url}") cmd = f"cd {gs_model_dir}; unzip {dst_zip_path}; mv models voice_classification; rm -f {dst_zip_path}" os.system(cmd) if not os.path.exists(dst_model_dir): logging.fatal(f"unzip {dst_zip_path} err") music_voice_pure_model = os.path.join(dst_model_dir, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(dst_model_dir, "voice_10_v5.pth") gender_pure_model = os.path.join(dst_model_dir, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(dst_model_dir, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) return vc -# def init_svc_model(): -# emb_model_path = os.path.join(gs_model_dir, "RawNet3_weights.pt") -# if not os.path.exists(emb_model_path): -# if not download2disk(gs_svc_emb_url, emb_model_path): -# logging.fatal(f"download svc_emb_model err={gs_svc_emb_url}") -# embed_model = get_embed_model(emb_model_path) -# hubert_model = load_hubert() -# -# svc_filename = gs_svc_model_url.split("/")[-1] -# svc_model_path = os.path.join(gs_model_dir, svc_filename) -# if not os.path.exists(svc_model_path): -# if not download2disk(gs_svc_model_url, svc_model_path): -# logging.fatal(f"download svc_model err={gs_svc_model_url}") -# -# # 此处内部会生成全局模型 -# get_vc(svc_model_path) -# return embed_model, hubert_model +def init_svc_model(): + 
meisheng_env_prepare(logging, gs_model_dir) + embed_model, hubert_model = load_model() + return embed_model, hubert_model -def init_svc_model(): - meisheng_env_prepare(logging) - embed_model, hubert_model = load_model() - return embed_model, hubert_model +def download_volume_adjustment(): + """ + 下载音量调整工具 + :return: + """ + volume_bin_path = os.path.join(gs_model_dir, "ebur128_tool") + if not os.path.exists(volume_bin_path): + if not download2disk(gs_volume_bin_url, volume_bin_path): + logging.fatal(f"download volume_bin err={gs_volume_bin_url}") + os.system(f"chmod +x {volume_bin_path}") def volume_adjustment(wav_path, target_loudness, out_path): """ 音量调整 :param wav_path: :param target_loudness: :param out_path: :return: """ volume_bin_path = os.path.join(gs_model_dir, "ebur128_tool") - if not os.path.exists(volume_bin_path): - if not download2disk(gs_volume_bin_url, volume_bin_path): - logging.fatal(f"download volume_bin err={gs_volume_bin_url}") cmd = f"{volume_bin_path} {wav_path} {target_loudness} {out_path}" os.system(cmd) class SVCOnline: def __init__(self): st = time.time() self.gender_model = init_gender_model() self.embed_model, self.hubert_model = init_svc_model() + download_volume_adjustment() + download_volume_balanced() logging.info(f"svc init finished, sp = {time.time() - st}") def gender_process(self, worker_attr): st = time.time() gender, female_rate, is_pure = self.gender_model.process(worker_attr.vocal_path) logging.info( f"{worker_attr.vocal_url}, gender={gender}, female_rate={female_rate}, is_pure={is_pure}, " f"gender_process sp = {time.time() - st}") if gender == 0: gender = 'female' elif gender == 1: gender = 'male' elif female_rate > 0.5: gender = 'female' else: gender = 'male' logging.info(f"{worker_attr.vocal_url}, modified gender={gender}") return gender def process(self, worker_attr): gender = self.gender_process(worker_attr) song_path = worker_attr.female_svc_source_path if gender == "male": song_path = worker_attr.male_svc_source_path - params = {'gender': gender, 'tst': worker_attr.st_ms, "tnd": worker_attr.ed_tm, 'delay': 0, 'song_path': None} + params = {'gender': gender, 'tst': worker_attr.st_tm, "tnd": worker_attr.ed_tm, 'delay': 0, 'song_path': None} st = time.time() similar = process_svc_online(song_path, worker_attr.vocal_path, worker_attr.target_wav_path, self.embed_model, - self.hubert_model, params) + self.hubert_model, params) logging.info(f"{worker_attr.vocal_url}, similar={similar} process svc sp = {time.time() - st}") + return gender diff --git a/AIMeiSheng/gender_classify.py b/AIMeiSheng/gender_classify.py index d77eb2a..1d9dcf5 100644 --- a/AIMeiSheng/gender_classify.py +++ b/AIMeiSheng/gender_classify.py @@ -1,32 +1,33 @@ import sys, os import time -sys.path.append('./voice_classification/online/') +sys.path.append(os.path.dirname(__file__)) +sys.path.append(os.path.join(os.path.dirname(__file__), './voice_classification/online/')) from voice_class_online_fang import VoiceClass def load_gender_model(): model_path = "./voice_classification/online/models" music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth") gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) return vc if __name__ == "__main__": # test_all() # test_all_feature() model_path = 
sys.argv[1] voice_path = sys.argv[2] music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth") gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) for i in range(0, 1): st = time.time() print("------------------------------>>>>>") gender, female_rate, is_pure = vc.process(voice_path) print("process|spend_tm=={}".format(time.time() - st)) print("gender:{}, female_rate:{},is_pure:{}".format(gender,female_rate,is_pure)) diff --git a/AIMeiSheng/meisheng_env_preparex.py b/AIMeiSheng/meisheng_env_preparex.py index bf6aa2e..a7bc0db 100644 --- a/AIMeiSheng/meisheng_env_preparex.py +++ b/AIMeiSheng/meisheng_env_preparex.py @@ -1,38 +1,37 @@ import os -from AIMeiSheng.docker_demo.common import * +from AIMeiSheng.docker_demo.common import (gs_svc_model_path, gs_hubert_model_path, gs_embed_model_path, + gs_rmvpe_model_path, download2disk) -def meisheng_env_prepare(logging,AIMeiSheng_Path='./'): +def meisheng_env_prepare(logging, AIMeiSheng_Path='./'): cos_path = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/dataset/AIMeiSheng/" + rmvpe_model_url = cos_path + "rmvpe.pt" + if not os.path.exists(gs_rmvpe_model_path): + if not download2disk(rmvpe_model_url, gs_rmvpe_model_path): + logging.fatal(f"download rmvpe_model err={rmvpe_model_url}") - rmvpe_model_path = os.path.join(AIMeiSheng_Path, 'rmvpe.pt') - gs_rmvpe_model_url = cos_path + "rmvpe.pt" - if not os.path.exists(rmvpe_model_path): - if not download2disk(gs_rmvpe_model_url, rmvpe_model_path): - logging.fatal(f"download rmvpe_model err={gs_rmvpe_model_url}") - - hubert_model_path = os.path.join(AIMeiSheng_Path, 'hubert_base.pt') gs_hubert_model_url = cos_path + "hubert_base.pt" - if not os.path.exists(hubert_model_path): - if not download2disk(gs_hubert_model_url, hubert_model_path): + if not os.path.exists(gs_hubert_model_path): + if not download2disk(gs_hubert_model_url, gs_hubert_model_path): logging.fatal(f"download hubert_model err={gs_hubert_model_url}") model_svc = "xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth" - svc_model_path = os.path.join(AIMeiSheng_Path, f'weights/{model_svc}') - gs_svc_model_url = cos_path + model_svc - if not os.path.exists(svc_model_path): - if not download2disk(gs_svc_model_url, svc_model_path): - logging.fatal(f"download svc_model err={gs_svc_model_url}") - + base_dir = os.path.dirname(gs_svc_model_path) + os.makedirs(base_dir, exist_ok=True) + svc_model_url = cos_path + model_svc + if not os.path.exists(gs_svc_model_path): + if not download2disk(svc_model_url, gs_svc_model_path): + logging.fatal(f"download svc_model err={svc_model_url}") model_embed = "model.pt" - embed_model_path = os.path.join(AIMeiSheng_Path, f'RawNet3/models/weights/{model_embed}') - gs_embed_model_url = cos_path + model_embed - if not os.path.exists(embed_model_path): - if not download2disk(gs_embed_model_url, embed_model_path): - logging.fatal(f"download embed_model err={gs_embed_model_url}") + base_dir = os.path.dirname(gs_embed_model_path) + os.makedirs(base_dir, exist_ok=True) + embed_model_url = cos_path + model_embed + if not os.path.exists(gs_embed_model_path): + if not download2disk(embed_model_url, gs_embed_model_path): + logging.fatal(f"download embed_model err={embed_model_url}") if __name__ == 
"__main__": - meisheng_env_prepare() + meisheng_env_prepare() diff --git a/AIMeiSheng/meisheng_svc_final.py b/AIMeiSheng/meisheng_svc_final.py index 1ecaaf7..d3de54d 100644 --- a/AIMeiSheng/meisheng_svc_final.py +++ b/AIMeiSheng/meisheng_svc_final.py @@ -1,227 +1,222 @@ +import os +import sys +sys.path.append(os.path.dirname(__file__)) -import os,sys import time import shutil import glob import hashlib import librosa import soundfile import gradio as gr import pandas as pd import numpy as np -sys.path.append('./RawNet3/') -from infererence_fang_meisheng import get_embed, get_embed_model -from myinfer_multi_spk_embed_in_dec_diff_fi_meisheng import svc_main,load_hubert, get_vc, get_rmvpe +from AIMeiSheng.RawNet3.infererence_fang_meisheng import get_embed, get_embed_model +from myinfer_multi_spk_embed_in_dec_diff_fi_meisheng import svc_main, load_hubert, get_vc, get_rmvpe from gender_classify import load_gender_model - +from AIMeiSheng.docker_demo.common import gs_svc_model_path, gs_embed_model_path, gs_rmvpe_model_path gs_simple_mixer_path = "/data/gpu_env_common/bin/simple_mixer" ##混音执行文件 -tmp_workspace_name = "batch_test_ocean_fi"#工作空间名 +tmp_workspace_name = "batch_test_ocean_fi" # 工作空间名 song_folder = "./data_meisheng/" ##song folder -gs_work_dir = f"./data_meisheng/{tmp_workspace_name}" #工作空间路径 -pth_model_path = "./weights/xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth" ##模型文件 - +gs_work_dir = f"./data_meisheng/{tmp_workspace_name}" # 工作空间路径 +pth_model_path = "./weights/xusong_v2_org_version_alldata_embed1_enzx_diff_fi_e15_s244110.pth" ##模型文件 cur_dir = os.path.abspath(os.path.dirname(__file__)) -abs_path = os.path.join(cur_dir,song_folder,tmp_workspace_name) + '/' +abs_path = os.path.join(cur_dir, song_folder, tmp_workspace_name) + '/' f0_method = None + def mix(in_path, acc_path, dst_path): # svc转码到442 svc_442_file = in_path + "_442.wav" st = time.time() cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(in_path, svc_442_file) os.system(cmd) if not os.path.exists(svc_442_file): return -1 print("transcode,{},sp={}".format(in_path, time.time() - st)) # 混合 st = time.time() cmd = "{} {} {} {} 1".format(gs_simple_mixer_path, svc_442_file, acc_path, dst_path) os.system(cmd) print("mixer,{},sp={}".format(in_path, time.time() - st)) def load_model(): global f0_method - embed_model = get_embed_model() + embed_model = get_embed_model(gs_embed_model_path) hubert_model = load_hubert() - get_vc(pth_model_path) - f0_method = get_rmvpe() + get_vc(gs_svc_model_path) + f0_method = get_rmvpe(gs_rmvpe_model_path) print("model preload finish!!!") - return embed_model, hubert_model#,svc_model + return embed_model, hubert_model # ,svc_model + def meisheng_init(): - embed_model, hubert_model = load_model() ##提前加载模型 + embed_model, hubert_model = load_model() ##提前加载模型 gender_model = load_gender_model() - return embed_model, hubert_model, gender_model + return embed_model, hubert_model, gender_model + def pyin_process_single_rmvpe(input_file): global f0_method if f0_method is None: f0_method = get_rmvpe() - rate = 16000 #44100 + rate = 16000 # 44100 # 读取音频文件 y, sr = librosa.load(input_file, sr=rate) - - len_s = len(y)/sr - lim_s = 15 #10 - if(len_s > lim_s): - y1 = y[:sr*lim_s] - y2 = y[-sr*lim_s:] + len_s = len(y) / sr + lim_s = 15 # 10 + if (len_s > lim_s): + y1 = y[:sr * lim_s] + y2 = y[-sr * lim_s:] f0 = f0_method.infer_from_audio(y1, thred=0.03) f0 = f0[f0 < 600] valid_f0 = f0[f0 > 50] mean_pitch1 = np.mean(valid_f0) f0 = f0_method.infer_from_audio(y2, thred=0.03) f0 = f0[f0 
< 600] valid_f0 = f0[f0 > 50] mean_pitch2 = np.mean(valid_f0) if abs(mean_pitch1 - mean_pitch2) > 55: - mean_pitch_cur = min(mean_pitch1, mean_pitch2) + mean_pitch_cur = min(mean_pitch1, mean_pitch2) else: - mean_pitch_cur = (mean_pitch1 + mean_pitch2) / 2 + mean_pitch_cur = (mean_pitch1 + mean_pitch2) / 2 else: f0 = f0_method.infer_from_audio(y, thred=0.03) f0 = f0[f0 < 600] valid_f0 = f0[f0 > 50] mean_pitch_cur = np.mean(valid_f0) - return mean_pitch_cur -def meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, paras): +def meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, paras): ##计算pitch f0up_key = pyin_process_single_rmvpe(target_wav) - ## get embed + ## get embed, 音色 get_embed(target_wav, embed_npy, embed_md) print("svc main start...") - svc_main(song_wav, svc_out_path, pth_model_path, embed_npy, f0up_key, hubert_md, paras) + svc_main(song_wav, svc_out_path, embed_npy, f0up_key, hubert_md, paras) print("svc main finished!!") return 0 -def process_svc_online(song_wav, target_wav, svc_out_path, embed_md, hubert_md, paras): +def process_svc_online(song_wav, target_wav, svc_out_path, embed_md, hubert_md, paras): embed_npy = target_wav[:-4] + '.npy' ##embd npy存储位置 similar = meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, paras) return similar -def process_svc(song_wav, target_wav, svc_out_path, embed_md, hubert_md, paras): +def process_svc(song_wav, target_wav, svc_out_path, embed_md, hubert_md, paras): song_wav1, target_wav, svc_out_path = os.path.basename(song_wav), os.path.basename( - target_wav), os.path.basename(svc_out_path) #绝对路径 + target_wav), os.path.basename(svc_out_path) # 绝对路径 song_wav, target_wav, svc_out_path = song_wav, abs_path + target_wav, abs_path + svc_out_path embed_npy = target_wav[:-4] + '.npy' ##embd npy存储位置 # similar = meisheng_svc(song_wav,target_wav,svc_out_path,embed_npy,paras) similar = meisheng_svc(song_wav, target_wav, svc_out_path, embed_npy, embed_md, hubert_md, paras) return similar def get_svc(target_yinse_wav, song_name, embed_model, hubert_model, paras): ''' :param target_yinse_wav: 目标音色 :param song_name: 歌曲名字 ;param paras: 其他参数 :return: svc路径名 ''' ##清空工作空间临时路径 if os.path.exists(gs_work_dir): - #shutil.rmtree(gs_work_dir) + # shutil.rmtree(gs_work_dir) cmd = f"rm -rf {gs_work_dir}/*" os.system(cmd) else: os.makedirs(gs_work_dir) - gender = paras['gender']##为了确定歌曲 + gender = paras['gender'] ##为了确定歌曲 ##目标音色读取 f_dst = os.path.join(gs_work_dir, os.path.basename(target_yinse_wav)) - #print("dir :", f_dst,"target_yinse_wav:",target_yinse_wav) - #shutil.move(target_yinse_wav, f_dst) ##放在工作目录 + # print("dir :", f_dst,"target_yinse_wav:",target_yinse_wav) + # shutil.move(target_yinse_wav, f_dst) ##放在工作目录 shutil.copy(target_yinse_wav, f_dst) target_yinse_wav = f_dst ##歌曲/伴奏 读取(路径需要修改) song_wav = os.path.join("{}{}/{}/vocal321.wav".format(song_folder, gender, song_name)) # 歌曲vocal inf_acc_path = os.path.join("{}{}/{}/acc.wav".format(song_folder, gender, song_name)) - #song_wav = './xusong_long.wav' + # song_wav = './xusong_long.wav' svc_out_path = os.path.join(gs_work_dir, "svc.wav") ###svc结果名字 print("inputMsg:", song_wav, target_yinse_wav, svc_out_path) ## svc process st = time.time() print("start inference...") - similar = process_svc(song_wav, target_yinse_wav, svc_out_path, embed_model, hubert_model,paras) + similar = process_svc(song_wav, target_yinse_wav, svc_out_path, embed_model, hubert_model, paras) print("svc finished!!") print("time cost = 
{}".format(time.time() - st)) print("out path name {} ".format(svc_out_path)) - #''' + # ''' ##加混响 print("add reverbration...") svc_out_path_effect = svc_out_path[:-4] + '_effect.wav' cmd = f"/data/gpu_env_common/bin/effect_tool {svc_out_path} {svc_out_path_effect}" print("cmd :", cmd) os.system(cmd) # # 人声伴奏合并 print("add acc...") out_path = svc_out_path_effect[:-4] + '_music.wav' mix(svc_out_path_effect, inf_acc_path, out_path) print("time cost = {}".format(time.time() - st)) print("out path name {} ".format(out_path)) - #''' - + # ''' return svc_out_path -def meisheng_func(target_yinse_wav,song_name, paras): - +def meisheng_func(target_yinse_wav, song_name, paras): ##init embed_model, hubert_model, gender_model = meisheng_init() ###gender predict gender, female_rate, is_pure = gender_model.process(target_yinse_wav) print('=====================') print("gender:{}, female_rate:{},is_pure:{}".format(gender, female_rate, is_pure)) if gender == 0: gender = 'female' elif gender == 1: gender = 'male' elif female_rate > 0.5: gender = 'female' else: gender = 'male' print("modified gender:{} ".format(gender)) print('=====================') ##美声main - paras['gender'] = gender ##单位都是ms + paras['gender'] = gender ##单位都是ms get_svc(target_yinse_wav, song_name, embed_model, hubert_model, paras) -if __name__=='__main__': - - #target_yinse_wav = "./raw/meisheng_yinse/female/changying.wav" # 需要完整路径 +if __name__ == '__main__': + # target_yinse_wav = "./raw/meisheng_yinse/female/changying.wav" # 需要完整路径 target_yinse_wav = "./raw/meisheng_yinse/female/target_yinse_cloris.m4a" - song_name = "lost_stars" ##歌曲名字 + song_name = "lost_stars" ##歌曲名字 paras = {'gender': None, 'tst': 0, "tnd": None, 'delay': 0, 'song_path': None} # paras = {'gender': 'female', 'tst': 0, "tnd": 30, 'delay': 0} ###片段svc测试 meisheng_func(target_yinse_wav, song_name, paras) - - - diff --git a/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_fi_meisheng.py b/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_fi_meisheng.py index b14397a..b68dc3b 100644 --- a/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_fi_meisheng.py +++ b/AIMeiSheng/myinfer_multi_spk_embed_in_dec_diff_fi_meisheng.py @@ -1,217 +1,215 @@ import os,sys,pdb,torch now_dir = os.getcwd() sys.path.append(now_dir) import argparse import glob import sys import torch from multiprocessing import cpu_count class Config: def __init__(self,device,is_half): self.device = device self.is_half = is_half self.n_cpu = 0 self.gpu_name = None self.gpu_mem = None self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() def device_config(self) -> tuple: if torch.cuda.is_available(): i_device = int(self.device.split(":")[-1]) self.gpu_name = torch.cuda.get_device_name(i_device) if ( ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) or "P40" in self.gpu_name.upper() or "1060" in self.gpu_name or "1070" in self.gpu_name or "1080" in self.gpu_name ): print("16系/10系显卡和P40强制单精度") self.is_half = False for config_file in ["32k.json", "40k.json", "48k.json"]: with open(f"configs/{config_file}", "r") as f: strr = f.read().replace("true", "false") with open(f"configs/{config_file}", "w") as f: f.write(strr) with open("trainset_preprocess_pipeline_print.py", "r") as f: strr = f.read().replace("3.7", "3.0") with open("trainset_preprocess_pipeline_print.py", "w") as f: f.write(strr) else: self.gpu_name = None self.gpu_mem = int( torch.cuda.get_device_properties(i_device).total_memory / 1024 / 1024 / 1024 + 0.4 ) if self.gpu_mem <= 4: with open("trainset_preprocess_pipeline_print.py", 
"r") as f: strr = f.read().replace("3.7", "3.0") with open("trainset_preprocess_pipeline_print.py", "w") as f: f.write(strr) elif torch.backends.mps.is_available(): print("没有发现支持的N卡, 使用MPS进行推理") self.device = "mps" else: print("没有发现支持的N卡, 使用CPU进行推理") self.device = "cpu" self.is_half = True if self.n_cpu == 0: self.n_cpu = cpu_count() if self.is_half: # 6G显存配置 x_pad = 3 x_query = 10 x_center = 80 #60 x_max = 85#65 else: # 5G显存配置 x_pad = 1 x_query = 6 x_center = 38 x_max = 41 if self.gpu_mem != None and self.gpu_mem <= 4: x_pad = 1 x_query = 5 x_center = 30 x_max = 32 return x_pad, x_query, x_center, x_max index_path="./logs/xusong_v2_org_version_multispk_charlie_puth_embed_in_dec_muloss_show/added_IVF614_Flat_nprobe_1_xusong_v2_org_version_multispk_charlie_puth_embed_in_dec_show_v2.index" # f0method="rmvpe" #harvest or pm index_rate=float("0.0") #index rate device="cuda:0" is_half=True filter_radius=int(3) ##3 resample_sr=int(0) # 0 rms_mix_rate=float(1) # rms混合比例 1,不等于1混合 protect=float(0.33 )## ??? 0.33 fang #print(sys.argv) config=Config(device,is_half) now_dir=os.getcwd() sys.path.append(now_dir) from vc_infer_pipeline_org_embed import VC from lib.infer_pack.models_embed_in_dec_diff_fi import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono, ) from lib.audio import load_audio from fairseq import checkpoint_utils from scipy.io import wavfile - +from AIMeiSheng.docker_demo.common import gs_hubert_model_path # hubert_model=None def load_hubert(): # global hubert_model - models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"],suffix="",) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([gs_hubert_model_path],suffix="",) #models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["checkpoint_best_legacy_500.pt"],suffix="",) hubert_model = models[0] hubert_model = hubert_model.to(device) if(is_half):hubert_model = hubert_model.half() else:hubert_model = hubert_model.float() hubert_model.eval() return hubert_model def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,index_rate,hubert_model,paras): global tgt_sr,net_g,vc,version if input_audio is None:return "You need to upload an audio", None f0_up_key = int(f0_up_key) # print("@@xxxf0_up_key:",f0_up_key) audio = load_audio(input_audio,16000) if paras != None: st = int(paras['tst'] * 16000/1000) en = len(audio) if paras['tnd'] != None: en = min(en,int(paras['tnd'] * 16000/1000)) audio = audio[st:en] times = [0, 0, 0] if(hubert_model==None): hubert_model = load_hubert() if_f0 = cpt.get("f0", 1) audio_opt=vc.pipeline_mulprocess(hubert_model,net_g,sid,audio,input_audio,times,f0_up_key,f0_method,file_index,index_rate,if_f0,filter_radius,tgt_sr,resample_sr,rms_mix_rate,version,protect,f0_file=f0_file) #print(times) #print("@@using multi process") return audio_opt def get_vc_core(model_path,is_half): #print("loading pth %s" % model_path) cpt = torch.load(model_path, map_location="cpu") tgt_sr = cpt["config"][-1] cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] if_f0 = cpt.get("f0", 1) version = cpt.get("version", "v1") if version == "v1": if if_f0 == 1: net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) else: net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) elif version == "v2": if if_f0 == 1: # net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) else: net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) #print("load model finished") del 
net_g.enc_q net_g.load_state_dict(cpt["weight"], strict=False) #print("load net_g finished") return tgt_sr,net_g,cpt,version def get_vc1(model_path,is_half): tgt_sr, net_g, cpt, version = get_vc_core(model_path, is_half) net_g.eval().to(device) if (is_half):net_g = net_g.half() else:net_g = net_g.float() vc = VC(tgt_sr, config) n_spk=cpt["config"][-3] return -def get_rmvpe(): +def get_rmvpe(model_path="rmvpe.pt"): from lib.rmvpe import RMVPE global f0_method #print("loading rmvpe model") - f0_method = RMVPE( - "rmvpe.pt", is_half=True, device='cuda:0' - ) + f0_method = RMVPE(model_path, is_half=True, device='cuda') return f0_method def get_vc(model_path): global n_spk,tgt_sr,net_g,vc,cpt,device,is_half,version tgt_sr, net_g, cpt, version = get_vc_core(model_path, is_half) net_g.eval().to(device) if (is_half):net_g = net_g.half() else:net_g = net_g.float() vc = VC(tgt_sr, config) n_spk=cpt["config"][-3] # return {"visible": True,"maximum": n_spk, "__type__": "update"} # return net_g -def svc_main(input_path,opt_path,model_path,sid_embed,f0up_key=0,hubert_model=None, paras=None): +def svc_main(input_path,opt_path,sid_embed,f0up_key=0,hubert_model=None, paras=None): #print("sid_embed: ",sid_embed) wav_opt = vc_single(sid_embed,input_path,f0up_key,None,f0_method,index_path,index_rate,hubert_model,paras) #print("out_path: ",opt_path) wavfile.write(opt_path, tgt_sr, wav_opt) diff --git a/AIMeiSheng/vc_infer_pipeline_org_embed.py b/AIMeiSheng/vc_infer_pipeline_org_embed.py index bfda281..d53f4ae 100644 --- a/AIMeiSheng/vc_infer_pipeline_org_embed.py +++ b/AIMeiSheng/vc_infer_pipeline_org_embed.py @@ -1,760 +1,760 @@ import numpy as np, parselmouth, torch, pdb, sys, os from time import time as ttime import torch.nn.functional as F import scipy.signal as signal import pyworld, os, traceback, faiss, librosa, torchcrepe from scipy import signal from functools import lru_cache now_dir = os.getcwd() sys.path.append(now_dir) bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) input_audio_path2wav = {} fidx = 0 import threading import concurrent.futures @lru_cache def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): audio = input_audio_path2wav[input_audio_path] f0, t = pyworld.harvest( audio, fs=fs, f0_ceil=f0max, f0_floor=f0min, frame_period=frame_period, ) f0 = pyworld.stonemask(audio, f0, t, fs) return f0 def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 # print(data1.max(),data2.max()) rms1 = librosa.feature.rms( y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 ) # 每半秒一个点 rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) rms1 = torch.from_numpy(rms1) rms1 = F.interpolate( rms1.unsqueeze(0), size=data2.shape[0], mode="linear" ).squeeze() rms2 = torch.from_numpy(rms2) rms2 = F.interpolate( rms2.unsqueeze(0), size=data2.shape[0], mode="linear" ).squeeze() rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) data2 *= ( torch.pow(rms1, torch.tensor(1 - rate)) * torch.pow(rms2, torch.tensor(rate - 1)) ).numpy() return data2 class VC(object): def __init__(self, tgt_sr, config): self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( config.x_pad, ##config会根据设备配置不通知如:3 config.x_query, # 10 等于x_max-x_center)*2 config.x_center, #60 config.x_max, #65 config.is_half, ) self.sr = 16000 # hubert输入采样率 self.window = 160 # 每帧点数 self.t_pad = self.sr * self.x_pad # 每条前后pad时间 self.t_pad_tgt = tgt_sr * self.x_pad self.t_pad2 = self.t_pad * 2 self.t_query = self.sr * self.x_query # 查询切点前后查询时间, self.t_center = self.sr * 
    def get_f0(
        self,
        input_audio_path,
        x,
        p_len,
        f0_up_key,
        f0_method,
        filter_radius,
        inp_f0=None,
    ):
        global input_audio_path2wav
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        if f0_method == "pm":
            f0 = (
                parselmouth.Sound(x, self.sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif f0_method == "harvest":
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
            if filter_radius > 2:
                f0 = signal.medfilt(f0, 3)
        elif f0_method == "crepe":
            model = "full"
            # Pick a batch size that doesn't cause memory errors on your gpu
            batch_size = 512
            # Compute pitch using first gpu
            audio = torch.tensor(np.copy(x))[None].float()
            f0, pd = torchcrepe.predict(
                audio,
                self.sr,
                self.window,
                f0_min,
                f0_max,
                model,
                batch_size=batch_size,
                device=self.device,
                return_periodicity=True,
            )
            pd = torchcrepe.filter.median(pd, 3)
            f0 = torchcrepe.filter.mean(f0, 3)
            f0[pd < 0.1] = 0
            f0 = f0[0].cpu().numpy()
        elif f0_method == "rmvpe":
            if hasattr(self, "model_rmvpe") == False:
                from lib.rmvpe import RMVPE

                print("loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "rmvpe.pt", is_half=self.is_half, device=self.device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        else:  ## for meisheng: f0_method is a preloaded RMVPE instance
            self.model_rmvpe = f0_method
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)

        ## pitch-shift section starts here (fang)
        valid_f0 = f0[f0 > 50]
        mean_pitch_cur = np.mean(valid_f0[:min(len(valid_f0), 500)])
        #print("@@f0_up_key:", f0_up_key)
        delta = 0
        if f0_up_key > 50:  # f0_up_key above 50 is treated as a target mean pitch in Hz
            delta = -mean_pitch_cur + f0_up_key
            #print("pitch shift:", delta)
            f0_up_key = int(np.log2(delta / (mean_pitch_cur + 1) + 1) * 12)  ## method 2 (fang)
            #if abs(f0_up_key) < 3:
            #    f0_up_key = 0
            f0_up_key = max(min(12, f0_up_key), -12)  # clamp the shift to one octave
        #print("f0_up_key:", f0_up_key)

        f0 *= pow(2, f0_up_key / 12)  # the pitch shift is applied here (fang); defaults to 0
        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                :shape
            ]
        # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
-        f0_coarse = np.rint(f0_mel).astype(np.int)
+        f0_coarse = np.rint(f0_mel).astype(int)
        return f0_coarse, f0bak  # 1-0
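The target-pitch branch in get_f0 deserves a worked example. When f0_up_key is above 50 it is interpreted as a target mean pitch in Hz, and since delta/(mean+1)+1 is roughly target/mean, the expression reduces to about 12*log2(target/mean) semitones. A standalone sketch with made-up numbers:

import numpy as np

mean_pitch_cur = 180.0  # current mean f0 of the voiced frames, Hz (made up)
target_pitch = 240.0    # caller passed f0_up_key=240

delta = -mean_pitch_cur + target_pitch
semitones = int(np.log2(delta / (mean_pitch_cur + 1) + 1) * 12)
semitones = max(min(12, semitones), -12)  # clamp to +/- one octave
print(semitones)                          # 4 (int() truncates 4.96)

# applying the shift scales every f0 value by 2**(semitones / 12)
print(180.0 * 2 ** (semitones / 12))      # ~226.8 Hz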
    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if self.is_half:
            feats = feats.half()
        else:
            feats = feats.float()
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
        #print("@@@feats:", feats.shape)
        #print("@@@padding_mask:", padding_mask.shape)
        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
            #"output_layer": 6 if version == "v1" else 12,
        }
        t0 = ttime()
        #'''
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]  # why does v1 need the projection? dimension mismatch? (fang)
        #'''
        #print("@@@feats:", feats.shape)
        '''
        global fidx
        feats_name = f"./feats_{fidx}.pt"
        fidx += 1
        torch.save(feats, feats_name)
        feats = torch.load(feats_name)
        #'''
        if protect < 0.5 and pitch != None and pitchf != None:
            feats0 = feats.clone()
        if (
            isinstance(index, type(None)) == False
            and isinstance(big_npy, type(None)) == False
            and index_rate != 0
        ):
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            # _, I = index.search(npy, 1)
            # npy = big_npy[I.squeeze()]

            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )  ## blend index-retrieved features with the live audio features as the input (fang)

        #print("@@@feats:", feats.shape)
        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if protect < 0.5 and pitch != None and pitchf != None:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )  # interpolate feats0 along dim 1 to double its length (fang)
        t1 = ttime()
        p_len = audio0.shape[0] // self.window  ## frame count for pitch (fang)
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if pitch != None and pitchf != None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]

        if protect < 0.5 and pitch != None and pitchf != None:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        #print("###feats:", feats.shape, "pitch:", pitch.shape, "p_len:", p_len)
        with torch.no_grad():
            if pitch != None and pitchf != None:
                audio1 = (
                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
            else:
                audio1 = (
                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                )
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1
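The k-nearest-neighbor blend inside vc() is the heart of the index_rate feature: the eight nearest stored features are averaged with inverse-squared-distance weights, then mixed with the live features. A self-contained toy version with random data (faiss.IndexFlatL2 stands in for the trained index; note that an exact hit gives a zero distance, which this weighting, like the original, does not guard against):

import numpy as np
import faiss

d, n = 256, 1000
big_npy = np.random.rand(n, d).astype("float32")   # stored training features
index = faiss.IndexFlatL2(d)
index.add(big_npy)

feats = np.random.rand(50, d).astype("float32")    # live hubert features
score, ix = index.search(feats, k=8)               # squared L2 distances, indices
weight = np.square(1 / score)                      # closer neighbors weigh more
weight /= weight.sum(axis=1, keepdims=True)
retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

index_rate = 0.75
blended = retrieved * index_rate + (1 - index_rate) * feats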
    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,  ## input wav
        input_audio_path,  # input wav name
        times,
        f0_up_key,
        f0_method,  # f0 method
        file_index,  # index path
        # file_big_npy,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        f0_file=None,
    ):
        if (
            file_index != ""  # the .index file path is non-empty (fang)
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy) == True
            and os.path.exists(file_index) == True
            and index_rate != 0
        ):
            try:
                index = faiss.read_index(file_index)
                # big_npy = np.load(file_big_npy)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except:
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None
        #print("####audio 1:", audio.shape)
        audio = signal.filtfilt(bh, ah, audio)
        #print("####audio 2:", audio.shape)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        #print("###t_max:", self.t_max)
        #print("###window:", self.window, "self.t_query:", self.t_query, "self.t_pad2:", self.t_pad2)
        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]  # rolling sum: each index holds the sum of the previous frame (fang)
            for t in range(self.t_center, audio.shape[0], self.t_center):  # one candidate cut per t_center (~60 s) (fang)
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                    )[0][0]
                )  # keep the index of the minimum within [t - t_query, t + t_query] (fang)
        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name") == True:
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        #sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        sid_embed = np.load(sid)
        sid = torch.FloatTensor(sid_embed).to(self.device).half()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        #print("&&&&pitch:", pitchf)
        t2 = ttime()
        times[1] += t2 - t1
        #print("####len(audio_pad):", len(audio_pad))
        #print("###pitch:", pitch.shape)
        for t in opt_ts:  # run inference segment by segment; each segment is ~60 s here (fang)
            t = t // self.window * self.window
            if if_f0 == 1:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        None,
                        None,
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
        if if_f0 == 1:  ## below handles the final segment (fang)
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window :] if t is not None else pitch,
                    pitchf[:, t // self.window :] if t is not None else pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
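The cut-point search at the top of pipeline() can be hard to read in place: a rolling one-frame sum approximates local energy, and each candidate cut near a multiple of t_center is nudged to the quietest sample within +/- t_query so segments split between phrases rather than mid-phoneme. A scaled-down standalone sketch of the same idea (values shrunk so it runs instantly):

import numpy as np

window, t_query, t_center = 160, 1600, 16000   # scaled-down stand-ins
audio = np.random.randn(80000).astype(np.float32)
audio_pad = np.pad(audio, (window // 2, window // 2), mode="reflect")

audio_sum = np.zeros_like(audio)
for i in range(window):
    audio_sum += audio_pad[i : i - window]     # sum over the past frame

opt_ts = []
for t in range(t_center, audio.shape[0], t_center):
    win = np.abs(audio_sum[t - t_query : t + t_query])
    opt_ts.append(t - t_query + np.where(win == win.min())[0][0])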
    def infer_core_fang(self, para1, para2, para3, idx,
                        model,
                        net_g,
                        sid,
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect):
        return [
            self.vc(
                model,
                net_g,
                sid,
                para1,
                para2,
                para3,
                # audio_pad[s: t + self.t_pad2 + self.window],
                # pitch[:, s // self.window: (t + self.t_pad2) // self.window],
                # pitchf[:, s // self.window: (t + self.t_pad2) // self.window],
                times,
                index,
                big_npy,
                index_rate,
                version,
                protect,
            )[self.t_pad_tgt: -self.t_pad_tgt],
            idx]

    def ThreadPool_process_core(self, func_process, params1, params2, params3,
                                model,
                                net_g,
                                sid,
                                # audio_pad[s: t + self.t_pad2 + self.window],
                                # pitch[:, s // self.window: (t + self.t_pad2) // self.window],
                                # pitchf[:, s // self.window: (t + self.t_pad2) // self.window],
                                times,
                                index,
                                big_npy,
                                index_rate,
                                version,
                                protect
                                ):
        num_threads = 2
        futures = []
        sort_ret = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            for idx in range(len(params1)):
                para1 = params1[idx]
                para2 = params2[idx]
                para3 = params3[idx]
                ret = executor.submit(self.infer_core_fang, para1, para2, para3, idx,
                                      model,
                                      net_g,
                                      sid,
                                      times,
                                      index,
                                      big_npy,
                                      index_rate,
                                      version,
                                      protect)
                futures.append(ret)

            cnt = 0
            for future in concurrent.futures.as_completed(futures):
                cnt += 1
                #print(f"process finished {cnt}, and index: {future.result()[1]}")
                #print(future.result())  # result
                # print(future.result()[1])  ## index
                sort_ret[str(future.result()[1])] = future.result()[0]

        fea_list = []
        for idx in range(len(sort_ret)):
            fea_list.append(sort_ret[str(idx)])

        return fea_list
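ThreadPool_process_core is an ordered fan-out: results arrive in completion order via as_completed, and the idx each worker carries back restores submission order. A generic, dependency-free sketch of the same pattern:

import concurrent.futures

def worker(item, idx):
    return [item * 2, idx]          # (result, original position)

items = [1, 2, 3, 4]
sort_ret = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(worker, item, idx)
               for idx, item in enumerate(items)]
    for future in concurrent.futures.as_completed(futures):
        result, idx = future.result()   # completion order is arbitrary
        sort_ret[idx] = result

ordered = [sort_ret[idx] for idx in range(len(sort_ret))]
print(ordered)                          # [2, 4, 6, 8]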
    def pipeline_mulprocess(
        self,
        model,
        net_g,
        sid,
        audio,  ## input wav
        input_audio_path,  # input wav name
        times,
        f0_up_key,
        f0_method,  # f0 method
        file_index,  # index path
        # file_big_npy,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        f0_file=None,
    ):
        if (
            file_index != ""  # the .index file path is non-empty (fang)
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy) == True
            and os.path.exists(file_index) == True
            and index_rate != 0
        ):
            try:
                index = faiss.read_index(file_index)
                # big_npy = np.load(file_big_npy)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except:
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i: i - self.window]  # rolling sum: each index holds the sum of the previous frame (fang)
            for t in range(self.t_center, audio.shape[0], self.t_center):  # one candidate cut per t_center (~60 s) (fang)
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query: t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query: t + self.t_query]).min()
                    )[0][0]
                )  # keep the index of the minimum within [t - t_query, t + t_query] (fang)
        s = 0
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name") == True:
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        # sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        sid_embed = np.load(sid)
        sid = torch.FloatTensor(sid_embed).to(self.device).half()
        pitch, pitchf = None, None
        #'''
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        #'''
        '''
        pitch_name = "./pitch_pitchf.npz"
        #np.savez(pitch_name, pitch = pitch.detach().cpu().numpy(), pitchf = pitchf.detach().cpu().numpy())
        npz_obj = np.load(pitch_name)  # the filename must carry the .npz suffix
        pitch, pitchf = npz_obj['pitch'], npz_obj['pitchf']
        pitch = torch.tensor(pitch, device=self.device).long()
        pitchf = torch.tensor(pitchf, device=self.device).float()
        #'''
        t2 = ttime()
        times[1] += t2 - t1

        audio_opt = []
        audio_pad_list = []
        pitch_list = []
        pitchf_list = []
        for t in opt_ts:  # run inference segment by segment; each segment is ~60 s here (fang)
            t = t // self.window * self.window
            audio_pad_list.append(audio_pad[s: t + self.t_pad2 + self.window])
            pitch_list.append(pitch[:, s // self.window: (t + self.t_pad2) // self.window])
            pitchf_list.append(pitchf[:, s // self.window: (t + self.t_pad2) // self.window])
            s = t
        audio_pad_list.append(audio_pad[t:])
        pitch_list.append(pitch[:, t // self.window:] if t is not None else pitch)
        pitchf_list.append(pitchf[:, t // self.window:] if t is not None else pitchf)

        audio_opt = self.ThreadPool_process_core(self.infer_core_fang,
                                                 audio_pad_list,
                                                 pitch_list,
                                                 pitchf_list,
                                                 model,
                                                 net_g,
                                                 sid,
                                                 times,
                                                 index,
                                                 big_npy,
                                                 index_rate,
                                                 version,
                                                 protect
                                                 )
        '''
        if if_f0 == 1:  ## below handles the final segment (fang)
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window:] if t is not None else pitch,
                    pitchf[:, t // self.window:] if t is not None else pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt: -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt: -self.t_pad_tgt]
            )
        #'''
        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
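Both pipelines end with the same peak normalization to int16. A standalone sketch of just that step (1% headroom; the gain only ever attenuates, so quiet audio is left untouched):

import numpy as np

audio_opt = np.random.randn(16000).astype(np.float32) * 1.5
audio_max = np.abs(audio_opt).max() / 0.99
max_int16 = 32768
if audio_max > 1:
    max_int16 /= audio_max              # attenuate so the peak cannot clip
pcm16 = (audio_opt * max_int16).astype(np.int16)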
diff --git a/AIMeiSheng/voice_classification/online/voice_class_online_fang.py b/AIMeiSheng/voice_classification/online/voice_class_online_fang.py
index dca2cfe..a1a7d8f 100644
--- a/AIMeiSheng/voice_classification/online/voice_class_online_fang.py
+++ b/AIMeiSheng/voice_classification/online/voice_class_online_fang.py
@@ -1,423 +1,442 @@
"""
Online male/female voice classification tool:
1. transcode to 16-bit mono
2. loudness equalization
3. model classification
"""
import os
import sys
import librosa
import shutil
import logging
import time
import torch.nn.functional as F
import numpy as np
from model import *
+
# from common import bind_kernel

logging.basicConfig(level=logging.INFO)

os.environ["LRU_CACHE_CAPACITY"] = "1"
# torch.set_num_threads(1)
# bind_kernel(1)

"""
Temporary globals for timing
"""
transcode_time = 0
vb_time = 0
mfcc_time = 0
predict_time = 0

"""
Error codes
"""
ERR_CODE_SUCCESS = 0  # success
ERR_CODE_NO_FILE = -1  # file does not exist
ERR_CODE_TRANSCODE = -2  # transcoding failed
ERR_CODE_VOLUME_BALANCED = -3  # volume balancing failed
ERR_CODE_FEATURE_TOO_SHORT = -4  # feature sequence too short

"""
Constants
"""
FRAME_LEN = 128
MFCC_LEN = 80

-EBUR128_BIN = "/data/gpu_env_common/res/av_svc/bin/standard_audio_no_cut"
+gs_bin_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/hub/voice_classification/bin/bin.zip"
+EBUR128_BIN = "/tmp/voice_class_bin/standard_audio_no_cut"
# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut"
GENDER_FEMALE = 0
GENDER_MALE = 1
GENDER_OTHER = 2

"""
Common helpers
"""
def exec_cmd(cmd):
    ret = os.system(cmd)
    if ret != 0:
        return False
    return True

"""
Business-logic helpers
"""
def get_one_mfcc(file_url):
    st = time.time()
    data, sr = librosa.load(file_url, sr=16000)
    if len(data) < 512:
        return []
    mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN)
    mfcc = mfcc.transpose()
    print("get_one_mfcc:spend_time={}".format(time.time() - st))
    global mfcc_time
    mfcc_time += time.time() - st
    return mfcc

+def download_volume_balanced():
+    import urllib.request
+    if not os.path.exists(EBUR128_BIN):
+        dst_path = "/tmp/bin.zip"
+        urllib.request.urlretrieve(gs_bin_url, dst_path)
+        if not os.path.exists(dst_path):
+            print(f"download dst_path={gs_bin_url} err!")
+            exit(-1)
+        dirname = os.path.dirname(dst_path)
+        cmd = f"cd {dirname}; unzip bin.zip; rm -f bin.zip; mv bin voice_class_bin"
+        os.system(cmd)
+        if not os.path.exists(EBUR128_BIN):
+            print(f"exec {cmd} err!")
+            exit(-1)
+
+
def volume_balanced(src, dst):
    st = time.time()
+    download_volume_balanced()
    cmd = "{} {} {}".format(EBUR128_BIN, src, dst)
    logging.info(cmd)
    exec_cmd(cmd)
    if not os.path.exists(dst):
        logging.error("volume_balanced:cmd={}".format(cmd))
    print("volume_balanced:spend_time={}".format(time.time() - st))
    global vb_time
    vb_time += time.time() - st
    return os.path.exists(dst)

def transcode(src, dst):
    st = time.time()
    cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst)
    logging.info(cmd)
    exec_cmd(cmd)
    if not os.path.exists(dst):
        logging.error("transcode:cmd={}".format(cmd))
    print("transcode:spend_time={}".format(time.time() - st))
    global transcode_time
    transcode_time += time.time() - st
    return os.path.exists(dst)
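The new download_volume_balanced helper shells out to unzip and mv. A hypothetical library-only alternative using zipfile (an illustrative sketch, not the code above; it assumes the same gs_bin_url and /tmp layout, and zipfile does not preserve the executable bit, hence the chmod):

import os
import urllib.request
import zipfile

def download_volume_balanced_alt():
    if os.path.exists(EBUR128_BIN):
        return
    dst_path = "/tmp/bin.zip"
    urllib.request.urlretrieve(gs_bin_url, dst_path)
    with zipfile.ZipFile(dst_path) as zf:
        zf.extractall("/tmp")                      # unpacks to /tmp/bin
    os.rename("/tmp/bin", "/tmp/voice_class_bin")  # assumes a top-level "bin" dir in the zip
    os.remove(dst_path)
    os.chmod(EBUR128_BIN, 0o755)                   # restore the exec bit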
class VoiceClass:
    def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model):
        """
        Four models:
        :param music_voice_pure_model: distinguishes pure vocals / other
        :param music_voice_no_pure_model: distinguishes vocals-present / other
        :param gender_pure_model: male/female on pure vocals
        :param gender_no_pure_model: male/female on vocals-present audio
        """
        st = time.time()
        self.device = "cpu"
        self.batch_size = 256
        self.music_voice_pure_model = load_model(MusicVoiceV5Model, music_voice_pure_model, self.device)
        self.music_voice_no_pure_model = load_model(MusicVoiceV5Model, music_voice_no_pure_model, self.device)
        self.gender_pure_model = load_model(MobileNetV2Gender, gender_pure_model, self.device)
        self.gender_no_pure_model = load_model(MobileNetV2Gender, gender_no_pure_model, self.device)
        logging.info("load model ok ! spend_time={}".format(time.time() - st))

    def batch_predict(self, model, features):
        st = time.time()
        scores = []
        with torch.no_grad():
            for i in range(0, len(features), self.batch_size):
                cur_data = features[i:i + self.batch_size].to(self.device)
                predicts = model(cur_data)
                predicts_score = F.softmax(predicts, dim=1)
                scores.extend(predicts_score.cpu().numpy())
        ret = np.array(scores)
        global predict_time
        predict_time += time.time() - st
        return ret

    def predict_pure(self, filename, features):
        scores = self.batch_predict(self.music_voice_pure_model, features)
        new_features = []
        for idx, score in enumerate(scores):
            if score[0] > 0.5:  # not vocal
                continue
            new_features.append(features[idx].numpy())

        # too few vocal segments to classify; thresholds are tunable
        new_feature_len = len(new_features)
        new_feature_rate = len(new_features) / len(features)
        if new_feature_len < 4 or new_feature_rate < 0.4:
            logging.warning(
                "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
            )
            return GENDER_OTHER, -1
        new_features = torch.from_numpy(np.array(new_features))
        scores = self.batch_predict(self.gender_pure_model, new_features)
        f_avg = sum(scores[:, 0]) / len(scores)
        m_avg = sum(scores[:, 1]) / len(scores)
        female_rate = f_avg / (f_avg + m_avg)
        if female_rate > 0.65:
            return GENDER_FEMALE, female_rate
        if female_rate < 0.12:
            return GENDER_MALE, female_rate
        logging.warning(
            "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
        )
        return GENDER_OTHER, female_rate

    def predict_no_pure(self, filename, features):
        scores = self.batch_predict(self.music_voice_no_pure_model, features)
        new_features = []
        for idx, score in enumerate(scores):
            if score[0] > 0.5:  # not vocal
                continue
            new_features.append(features[idx].numpy())

        # too few vocal segments to classify; thresholds are tunable
        new_feature_len = len(new_features)
        new_feature_rate = len(new_features) / len(features)
        if new_feature_len < 4 or new_feature_rate < 0.4:
            logging.warning(
                "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
            )
            return GENDER_OTHER, -1
        new_features = torch.from_numpy(np.array(new_features))
        scores = self.batch_predict(self.gender_no_pure_model, new_features)
        f_avg = sum(scores[:, 0]) / len(scores)
        m_avg = sum(scores[:, 1]) / len(scores)
        female_rate = f_avg / (f_avg + m_avg)
        if female_rate > 0.75:
            return GENDER_FEMALE, female_rate
        if female_rate < 0.1:
            return GENDER_MALE, female_rate
        logging.warning(
            "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
        )
        return GENDER_OTHER, female_rate

    def predict(self, filename, features):
        st = time.time()
        new_features = []
        for i in range(FRAME_LEN, len(features), FRAME_LEN):
            new_features.append(features[i - FRAME_LEN: i])
        new_features = torch.from_numpy(np.array(new_features))
        gender, rate = self.predict_pure(filename, new_features)
        if gender == GENDER_OTHER:
            logging.info("start no pure process...")
            gender, rate = self.predict_no_pure(filename, new_features)
            return gender, rate, False
        print("predict|spend_time={}".format(time.time() - st))
        return gender, rate, True
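predict() frames the (num_frames, 80) MFCC matrix into non-overlapping FRAME_LEN windows before batch_predict scores them; with hop_length=256 at 16 kHz, each 128-frame window covers roughly 2 s of audio. A toy illustration of just the framing:

import numpy as np
import torch

FRAME_LEN, MFCC_LEN = 128, 80
features = np.random.rand(700, MFCC_LEN).astype(np.float32)  # fake MFCCs

windows = [features[i - FRAME_LEN:i]
           for i in range(FRAME_LEN, len(features), FRAME_LEN)]
batch = torch.from_numpy(np.array(windows))
print(batch.shape)                     # torch.Size([5, 128, 80])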
    def process_one_logic(self, filename, file_path, cache_dir):
        tmp_wav = os.path.join(cache_dir, "tmp.wav")
        tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav")
        if not transcode(file_path, tmp_wav):
            return ERR_CODE_TRANSCODE, None, None
        if not volume_balanced(tmp_wav, tmp_vb_wav):
            return ERR_CODE_VOLUME_BALANCED, None, None
        features = get_one_mfcc(tmp_vb_wav)
        if len(features) < FRAME_LEN:
            logging.error("feature too short|file_path={}".format(file_path))
            return ERR_CODE_FEATURE_TOO_SHORT, None, None
        return self.predict(filename, features)

    def process_one(self, file_path):
        base_dir = os.path.dirname(file_path)
        filename = os.path.splitext(file_path)[0]  # keeps the directory part; with absolute paths cache_dir lands next to the input file
-        print("filename:",filename)
+        print("filename:", filename)
        cache_dir = os.path.join(base_dir, filename + "_cache")
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir)
        ret = self.process_one_logic(filename, file_path, cache_dir)
        shutil.rmtree(cache_dir)
        return ret

    def process(self, file_path):
        gender, female_rate, is_pure = self.process_one(file_path)
        logging.info("{}|gender={}|female_rate={}".format(file_path, gender, female_rate))
        return gender, female_rate, is_pure

    def process_by_feature(self, feature_file):
        """
        Process a saved feature file directly
        :param feature_file:
        :return:
        """
        filename = os.path.splitext(feature_file)[0]
        features = np.load(feature_file)
        gender, female_rate, _ = self.predict(filename, features)  # predict returns a 3-tuple; the is_pure flag is dropped here
        return gender, female_rate

def test_all_feature():
    import glob
    base_dir = "/data/datasets/music_voice_dataset_full/feature_online_data_v3"
    female = glob.glob(os.path.join(base_dir, "female/*feature.npy"))
    male = glob.glob(os.path.join(base_dir, "male/*feature.npy"))
    other = glob.glob(os.path.join(base_dir, "other/*feature.npy"))
    model_path = "/data/jianli.yang/voice_classification/online/models"
    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
    tot_st = time.time()
    ret_map = {
        0: {0: 0, 1: 0, 2: 0},
        1: {0: 0, 1: 0, 2: 0},
        2: {0: 0, 1: 0, 2: 0}
    }
    for file in female:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process_by_feature(file)
        ret_map[0][gender] += 1
        if gender != 0:
            print("err:female->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    for file in male:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process_by_feature(file)
        ret_map[1][gender] += 1
        if gender != 1:
            print("err:male->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    for file in other:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process_by_feature(file)
        ret_map[2][gender] += 1
        if gender != 2:
            print("err:other->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    global transcode_time, vb_time, mfcc_time, predict_time
    print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time, vb_time, mfcc_time, predict_time))
    f_f = ret_map[0][0]
    f_m = ret_map[0][1]
    f_o = ret_map[0][2]
    m_f = ret_map[1][0]
    m_m = ret_map[1][1]
    m_o = ret_map[1][2]
    o_f = ret_map[2][0]
    o_m = ret_map[2][1]
    o_o = ret_map[2][2]
    print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
    print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
    print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
    # female precision and recall
    f_acc = f_f / (f_f + m_f + o_f)
    f_recall = f_f / (f_f + f_m + f_o)
    # male precision and recall
    m_acc = m_m / (m_m + f_m + o_m)
    m_recall = m_m / (m_m + m_f + m_o)
    print("female: acc={}|recall={}".format(f_acc, f_recall))
    print("male: acc={}|recall={}".format(m_acc, m_recall))
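The accuracy/recall arithmetic above reads ret_map[true_label][predicted_label]. A small worked example with made-up counts (0=female, 1=male, 2=other):

ret_map = {
    0: {0: 90, 1: 5, 2: 5},
    1: {0: 4, 1: 92, 2: 4},
    2: {0: 6, 1: 8, 2: 86},
}
f_f, f_m, f_o = ret_map[0][0], ret_map[0][1], ret_map[0][2]
m_f, m_m, m_o = ret_map[1][0], ret_map[1][1], ret_map[1][2]
o_f, o_m, o_o = ret_map[2][0], ret_map[2][1], ret_map[2][2]

f_acc = f_f / (f_f + m_f + o_f)     # precision: of everything called "female"
f_recall = f_f / (f_f + f_m + f_o)  # recall: of everything truly female
print(f_acc, f_recall)              # 0.9 0.9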
print("male: acc={}|recall={}".format(m_acc, m_recall)) def test_all(): import glob base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200" female = glob.glob(os.path.join(base_dir, "female/*mp4")) male = glob.glob(os.path.join(base_dir, "male/*mp4")) other = glob.glob(os.path.join(base_dir, "other/*mp4")) model_path = "/data/jianli.yang/voice_classification/online/models" music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth") gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) tot_st = time.time() ret_map = { 0: {0: 0, 1: 0, 2: 0}, 1: {0: 0, 1: 0, 2: 0}, 2: {0: 0, 1: 0, 2: 0} } for file in female: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process(file) ret_map[0][gender] += 1 if gender != 0: print("err:female->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) for file in male: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process(file) ret_map[1][gender] += 1 if gender != 1: print("err:male->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) for file in other: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process(file) ret_map[2][gender] += 1 if gender != 2: print("err:other->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) global transcode_time, vb_time, mfcc_time, predict_time print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time, vb_time, mfcc_time, predict_time)) f_f = ret_map[0][0] f_m = ret_map[0][1] f_o = ret_map[0][2] m_f = ret_map[1][0] m_m = ret_map[1][1] m_o = ret_map[1][2] o_f = ret_map[2][0] o_m = ret_map[2][1] o_o = ret_map[2][2] print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o)) print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o)) print("om:{},of:{},oo:{}".format(o_m, o_f, o_o)) # 女性准确率和召回率 f_acc = f_f / (f_f + m_f + o_f) f_recall = f_f / (f_f + f_m + f_o) # 男性准确率和召回率 m_acc = m_m / (m_m + f_m + o_m) m_recall = m_m / (m_m + m_f + m_o) print("female: acc={}|recall={}".format(f_acc, f_recall)) print("male: acc={}|recall={}".format(m_acc, m_recall)) if __name__ == "__main__": # test_all() # test_all_feature() model_path = sys.argv[1] voice_path = sys.argv[2] music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth") gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) for i in range(0, 1): st = time.time() print("------------------------------>>>>>") gender, female_rate, is_pure = vc.process(voice_path) print("process|spend_tm=={}".format(time.time() - st)) - print("gender:{}, female_rate:{},is_pure:{}".format(gender,female_rate,is_pure)) + print("gender:{}, female_rate:{},is_pure:{}".format(gender, female_rate, is_pure))