""" 男女声分类在线工具 1 转码为16bit单声道 2 均衡化 3 模型分类 """ import os import sys import librosa import shutil import logging import time import torch.nn.functional as F import numpy as np from model import * # from common import bind_kernel logging.basicConfig(level=logging.INFO) os.environ["LRU_CACHE_CAPACITY"] = "1" # torch.set_num_threads(1) # bind_kernel(1) """ 临时用一下,全局使用的变量 """ transcode_time = 0 vb_time = 0 mfcc_time = 0 predict_time = 0 """ 错误码 """ ERR_CODE_SUCCESS = 0 # 处理成功 ERR_CODE_NO_FILE = -1 # 文件不存在 ERR_CODE_TRANSCODE = -2 # 转码失败 ERR_CODE_VOLUME_BALANCED = -3 # 均衡化失败 ERR_CODE_FEATURE_TOO_SHORT = -4 # 特征文件太短 """ 常量 """ FRAME_LEN = 128 MFCC_LEN = 80 EBUR128_BIN = "/opt/soft/bin/standard_audio_no_cut" # EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut" GENDER_FEMALE = 0 GENDER_MALE = 1 GENDER_OTHER = 2 """ 通用函数 """ def exec_cmd(cmd): ret = os.system(cmd) if ret != 0: return False return True """ 业务需要的函数 """ def get_one_mfcc(file_url): st = time.time() data, sr = librosa.load(file_url, sr=16000) if len(data) < 512: return [] mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN) mfcc = mfcc.transpose() print("get_one_mfcc:spend_time={}".format(time.time() - st)) global mfcc_time mfcc_time += time.time() - st return mfcc def volume_balanced(src, dst): st = time.time() cmd = "{} {} {}".format(EBUR128_BIN, src, dst) logging.info(cmd) exec_cmd(cmd) if not os.path.exists(dst): logging.error("volume_balanced:cmd={}".format(cmd)) print("volume_balanced:spend_time={}".format(time.time() - st)) global vb_time vb_time += time.time() - st return os.path.exists(dst) def transcode(src, dst): st = time.time() cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst) logging.info(cmd) exec_cmd(cmd) if not os.path.exists(dst): logging.error("transcode:cmd={}".format(cmd)) print("transcode:spend_time={}".format(time.time() - st)) global transcode_time transcode_time += time.time() - st return os.path.exists(dst) class VoiceClass: def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model): """ 四个模型 :param music_voice_pure_model: 分辨纯净人声/其他 :param music_voice_no_pure_model: 分辨有人声/其他 :param gender_pure_model: 纯净人声分辨男女 :param gender_no_pure_model: 有人声分辨男女 """ st = time.time() self.device = "cpu" self.batch_size = 256 self.music_voice_pure_model = load_model(MusicVoiceV5Model, music_voice_pure_model, self.device) self.music_voice_no_pure_model = load_model(MusicVoiceV5Model, music_voice_no_pure_model, self.device) self.gender_pure_model = load_model(MobileNetV2Gender, gender_pure_model, self.device) self.gender_no_pure_model = load_model(MobileNetV2Gender, gender_no_pure_model, self.device) logging.info("load model ok ! spend_time={}".format(time.time() - st)) def batch_predict(self, model, features): st = time.time() scores = [] with torch.no_grad(): for i in range(0, len(features), self.batch_size): cur_data = features[i:i + self.batch_size].to(self.device) predicts = model(cur_data) predicts_score = F.softmax(predicts, dim=1) scores.extend(predicts_score.cpu().numpy()) ret = np.array(scores) global predict_time predict_time += time.time() - st return ret def predict_pure(self, filename, features): scores = self.batch_predict(self.music_voice_pure_model, features) new_features = [] for idx, score in enumerate(scores): if score[0] > 0.5: # 非人声 continue new_features.append(features[idx].numpy()) # 人声段太少,不能进行处理 # 参数可以改 new_feature_len = len(new_features) new_feature_rate = len(new_features) / len(features) if new_feature_len < 4 or new_feature_rate < 0.4: logging.warning( "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate) ) return GENDER_OTHER, -1 new_features = torch.from_numpy(np.array(new_features)) scores = self.batch_predict(self.gender_pure_model, new_features) f_avg = sum(scores[:, 0]) / len(scores) m_avg = sum(scores[:, 1]) / len(scores) female_rate = f_avg / (f_avg + m_avg) if female_rate > 0.65: return GENDER_FEMALE, female_rate if female_rate < 0.12: return GENDER_MALE, female_rate logging.warning( "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate) ) return GENDER_OTHER, female_rate def predict_no_pure(self, filename, features): scores = self.batch_predict(self.music_voice_no_pure_model, features) new_features = [] for idx, score in enumerate(scores): if score[0] > 0.5: # 非人声 continue new_features.append(features[idx].numpy()) # 人声段太少,不能进行处理 # 参数可以改 new_feature_len = len(new_features) new_feature_rate = len(new_features) / len(features) if new_feature_len < 4 or new_feature_rate < 0.4: logging.warning( "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate) ) return GENDER_OTHER, -1 new_features = torch.from_numpy(np.array(new_features)) scores = self.batch_predict(self.gender_no_pure_model, new_features) f_avg = sum(scores[:, 0]) / len(scores) m_avg = sum(scores[:, 1]) / len(scores) female_rate = f_avg / (f_avg + m_avg) if female_rate > 0.75: return GENDER_FEMALE, female_rate if female_rate < 0.1: return GENDER_MALE, female_rate logging.warning( "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate) ) return GENDER_OTHER, female_rate def predict(self, filename, features): st = time.time() new_features = [] for i in range(FRAME_LEN, len(features), FRAME_LEN): new_features.append(features[i - FRAME_LEN: i]) new_features = torch.from_numpy(np.array(new_features)) gender, rate = self.predict_pure(filename, new_features) if gender == GENDER_OTHER: logging.info("start no pure process...") return self.predict_no_pure(filename, new_features) print("predict|spend_time={}".format(time.time() - st)) return gender, rate def process_one_logic(self, filename, file_path, cache_dir): tmp_wav = os.path.join(cache_dir, "tmp.wav") tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav") if not transcode(file_path, tmp_wav): return ERR_CODE_TRANSCODE if not volume_balanced(tmp_wav, tmp_vb_wav): return ERR_CODE_VOLUME_BALANCED features = get_one_mfcc(tmp_vb_wav) if len(features) < FRAME_LEN: logging.error("feature too short|file_path={}".format(file_path)) return ERR_CODE_FEATURE_TOO_SHORT return self.predict(filename, features) def process_one(self, file_path): base_dir = os.path.dirname(file_path) filename = os.path.splitext(file_path)[0] cache_dir = os.path.join(base_dir, filename + "_cache") if os.path.exists(cache_dir): shutil.rmtree(cache_dir) os.makedirs(cache_dir) ret = self.process_one_logic(filename, file_path, cache_dir) shutil.rmtree(cache_dir) return ret def process(self, file_path): gender, female_rate = self.process_one(file_path) logging.info("{}|gender={}|female_rate={}".format(file_path, gender, female_rate)) return gender, female_rate def process_by_feature(self, feature_file): """ 直接处理特征文件 :param feature_file: :return: """ filename = os.path.splitext(feature_file)[0] features = np.load(feature_file) gender, female_rate = self.predict(filename, features) return gender, female_rate def test_all_feature(): import glob base_dir = "/data/datasets/music_voice_dataset_full/feature_online_data_v3" female = glob.glob(os.path.join(base_dir, "female/*feature.npy")) male = glob.glob(os.path.join(base_dir, "male/*feature.npy")) other = glob.glob(os.path.join(base_dir, "other/*feature.npy")) model_path = "/data/jianli.yang/voice_classification/online/models" music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth") gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) tot_st = time.time() ret_map = { 0: {0: 0, 1: 0, 2: 0}, 1: {0: 0, 1: 0, 2: 0}, 2: {0: 0, 1: 0, 2: 0} } for file in female: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process_by_feature(file) ret_map[0][gender] += 1 if gender != 0: print("err:female->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) for file in male: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process_by_feature(file) ret_map[1][gender] += 1 if gender != 1: print("err:male->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) for file in other: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process_by_feature(file) ret_map[2][gender] += 1 if gender != 2: print("err:other->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) global transcode_time, vb_time, mfcc_time, predict_time print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time, vb_time, mfcc_time, predict_time)) f_f = ret_map[0][0] f_m = ret_map[0][1] f_o = ret_map[0][2] m_f = ret_map[1][0] m_m = ret_map[1][1] m_o = ret_map[1][2] o_f = ret_map[2][0] o_m = ret_map[2][1] o_o = ret_map[2][2] print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o)) print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o)) print("om:{},of:{},oo:{}".format(o_m, o_f, o_o)) # 女性准确率和召回率 f_acc = f_f / (f_f + m_f + o_f) f_recall = f_f / (f_f + f_m + f_o) # 男性准确率和召回率 m_acc = m_m / (m_m + f_m + o_m) m_recall = m_m / (m_m + m_f + m_o) print("female: acc={}|recall={}".format(f_acc, f_recall)) print("male: acc={}|recall={}".format(m_acc, m_recall)) def test_all(): import glob base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200" female = glob.glob(os.path.join(base_dir, "female/*mp4")) male = glob.glob(os.path.join(base_dir, "male/*mp4")) other = glob.glob(os.path.join(base_dir, "other/*mp4")) model_path = "/data/jianli.yang/voice_classification/online/models" music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth") gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) tot_st = time.time() ret_map = { 0: {0: 0, 1: 0, 2: 0}, 1: {0: 0, 1: 0, 2: 0}, 2: {0: 0, 1: 0, 2: 0} } for file in female: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process(file) ret_map[0][gender] += 1 if gender != 0: print("err:female->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) for file in male: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process(file) ret_map[1][gender] += 1 if gender != 1: print("err:male->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) for file in other: st = time.time() print("------------------------------>>>>>") gender, female_score = vc.process(file) ret_map[2][gender] += 1 if gender != 2: print("err:other->{}|{}|{}".format(gender, file, female_score)) print("process|spend_tm=={}".format(time.time() - st)) global transcode_time, vb_time, mfcc_time, predict_time print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time, vb_time, mfcc_time, predict_time)) f_f = ret_map[0][0] f_m = ret_map[0][1] f_o = ret_map[0][2] m_f = ret_map[1][0] m_m = ret_map[1][1] m_o = ret_map[1][2] o_f = ret_map[2][0] o_m = ret_map[2][1] o_o = ret_map[2][2] print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o)) print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o)) print("om:{},of:{},oo:{}".format(o_m, o_f, o_o)) # 女性准确率和召回率 f_acc = f_f / (f_f + m_f + o_f) f_recall = f_f / (f_f + f_m + f_o) # 男性准确率和召回率 m_acc = m_m / (m_m + f_m + o_m) m_recall = m_m / (m_m + m_f + m_o) print("female: acc={}|recall={}".format(f_acc, f_recall)) print("male: acc={}|recall={}".format(m_acc, m_recall)) if __name__ == "__main__": # test_all() # test_all_feature() model_path = sys.argv[1] voice_path = sys.argv[2] music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth") music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth") gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth") gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth") vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) for i in range(0, 1): st = time.time() print("------------------------------>>>>>") vc.process(voice_path) print("process|spend_tm=={}".format(time.time() - st))