"""
男女声分类在线工具
1 转码为16bit单声道
2 均衡化
3 模型分类
"""
import os
import sys
import shutil
import logging
import time

import librosa
import numpy as np
import torch  # used directly below (torch.no_grad, torch.from_numpy)
import torch.nn.functional as F

from model import *
# from common import bind_kernel

logging.basicConfig(level=logging.INFO)
os.environ["LRU_CACHE_CAPACITY"] = "1"
# torch.set_num_threads(1)
# bind_kernel(1)
"""
临时用一下,全局使用的变量
"""
transcode_time = 0
vb_time = 0
mfcc_time = 0
predict_time = 0
"""
错误码
"""
ERR_CODE_SUCCESS = 0 # 处理成功
ERR_CODE_NO_FILE = -1 # 文件不存在
ERR_CODE_TRANSCODE = -2 # 转码失败
ERR_CODE_VOLUME_BALANCED = -3 # 均衡化失败
ERR_CODE_FEATURE_TOO_SHORT = -4 # 特征文件太短
"""
常量
"""
FRAME_LEN = 128
MFCC_LEN = 80
EBUR128_BIN = "/opt/soft/bin/standard_audio_no_cut"
# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut"
GENDER_FEMALE = 0
GENDER_MALE = 1
GENDER_OTHER = 2
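# Note: FRAME_LEN is measured in MFCC frames. With the settings used in
# get_one_mfcc() below (sr=16000, hop_length=256), one classification window
# covers 128 * 256 / 16000 ≈ 2.05 seconds of audio.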
"""
通用函数
"""
def exec_cmd(cmd):
ret = os.system(cmd)
if ret != 0:
return False
return True
"""
业务需要的函数
"""
def get_one_mfcc(file_url):
st = time.time()
data, sr = librosa.load(file_url, sr=16000)
if len(data) < 512:
return []
mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN)
mfcc = mfcc.transpose()
print("get_one_mfcc:spend_time={}".format(time.time() - st))
global mfcc_time
mfcc_time += time.time() - st
return mfcc
def volume_balanced(src, dst):
    st = time.time()
    cmd = "{} {} {}".format(EBUR128_BIN, src, dst)
    logging.info(cmd)
    exec_cmd(cmd)
    if not os.path.exists(dst):
        logging.error("volume_balanced:cmd={}".format(cmd))
    print("volume_balanced:spend_time={}".format(time.time() - st))
    global vb_time
    vb_time += time.time() - st
    return os.path.exists(dst)
def transcode(src, dst):
    st = time.time()
    cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst)
    logging.info(cmd)
    exec_cmd(cmd)
    if not os.path.exists(dst):
        logging.error("transcode:cmd={}".format(cmd))
    print("transcode:spend_time={}".format(time.time() - st))
    global transcode_time
    transcode_time += time.time() - st
    return os.path.exists(dst)
class VoiceClass:
    def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model):
        """
        Four models:
        :param music_voice_pure_model: pure (clean) vocals vs. everything else
        :param music_voice_no_pure_model: vocals present vs. everything else
        :param gender_pure_model: male/female classifier for pure vocals
        :param gender_no_pure_model: male/female classifier for non-pure vocals
        """
        st = time.time()
        self.device = "cpu"
        self.batch_size = 256
        self.music_voice_pure_model = load_model(MusicVoiceV5Model, music_voice_pure_model, self.device)
        self.music_voice_no_pure_model = load_model(MusicVoiceV5Model, music_voice_no_pure_model, self.device)
        self.gender_pure_model = load_model(MobileNetV2Gender, gender_pure_model, self.device)
        self.gender_no_pure_model = load_model(MobileNetV2Gender, gender_no_pure_model, self.device)
        logging.info("load model ok ! spend_time={}".format(time.time() - st))
    def batch_predict(self, model, features):
        st = time.time()
        scores = []
        with torch.no_grad():
            for i in range(0, len(features), self.batch_size):
                cur_data = features[i:i + self.batch_size].to(self.device)
                predicts = model(cur_data)
                predicts_score = F.softmax(predicts, dim=1)
                scores.extend(predicts_score.cpu().numpy())
        ret = np.array(scores)
        global predict_time
        predict_time += time.time() - st
        return ret
    def predict_pure(self, filename, features):
        scores = self.batch_predict(self.music_voice_pure_model, features)
        new_features = []
        for idx, score in enumerate(scores):
            if score[0] > 0.5:  # non-vocal window
                continue
            new_features.append(features[idx].numpy())
        # Too few vocal windows to make a decision.
        # These thresholds are tunable.
        new_feature_len = len(new_features)
        new_feature_rate = len(new_features) / len(features)
        if new_feature_len < 4 or new_feature_rate < 0.4:
            logging.warning(
                "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
            )
            return GENDER_OTHER, -1
        new_features = torch.from_numpy(np.array(new_features))
        scores = self.batch_predict(self.gender_pure_model, new_features)
        f_avg = sum(scores[:, 0]) / len(scores)
        m_avg = sum(scores[:, 1]) / len(scores)
        female_rate = f_avg / (f_avg + m_avg)
        if female_rate > 0.65:
            return GENDER_FEMALE, female_rate
        if female_rate < 0.12:
            return GENDER_MALE, female_rate
        logging.warning(
            "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
        )
        return GENDER_OTHER, female_rate
    def predict_no_pure(self, filename, features):
        scores = self.batch_predict(self.music_voice_no_pure_model, features)
        new_features = []
        for idx, score in enumerate(scores):
            if score[0] > 0.5:  # non-vocal window
                continue
            new_features.append(features[idx].numpy())
        # Too few vocal windows to make a decision.
        # These thresholds are tunable.
        new_feature_len = len(new_features)
        new_feature_rate = len(new_features) / len(features)
        if new_feature_len < 4 or new_feature_rate < 0.4:
            logging.warning(
                "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
            )
            return GENDER_OTHER, -1
        new_features = torch.from_numpy(np.array(new_features))
        scores = self.batch_predict(self.gender_no_pure_model, new_features)
        f_avg = sum(scores[:, 0]) / len(scores)
        m_avg = sum(scores[:, 1]) / len(scores)
        female_rate = f_avg / (f_avg + m_avg)
        if female_rate > 0.75:
            return GENDER_FEMALE, female_rate
        if female_rate < 0.1:
            return GENDER_MALE, female_rate
        logging.warning(
            "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
        )
        return GENDER_OTHER, female_rate
    def predict(self, filename, features):
        st = time.time()
        # Slice the MFCC sequence into non-overlapping windows of FRAME_LEN frames.
        new_features = []
        for i in range(FRAME_LEN, len(features), FRAME_LEN):
            new_features.append(features[i - FRAME_LEN: i])
        new_features = torch.from_numpy(np.array(new_features))
        gender, rate = self.predict_pure(filename, new_features)
        if gender == GENDER_OTHER:
            logging.info("start no pure process...")
            return self.predict_no_pure(filename, new_features)
        print("predict|spend_time={}".format(time.time() - st))
        return gender, rate
    def process_one_logic(self, filename, file_path, cache_dir):
        # Returns a negative ERR_CODE_* on failure, or (gender, rate) on success.
        tmp_wav = os.path.join(cache_dir, "tmp.wav")
        tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav")
        if not transcode(file_path, tmp_wav):
            return ERR_CODE_TRANSCODE
        if not volume_balanced(tmp_wav, tmp_vb_wav):
            return ERR_CODE_VOLUME_BALANCED
        features = get_one_mfcc(tmp_vb_wav)
        if len(features) < FRAME_LEN:
            logging.error("feature too short|file_path={}".format(file_path))
            return ERR_CODE_FEATURE_TOO_SHORT
        return self.predict(filename, features)
    def process_one(self, file_path):
        base_dir = os.path.dirname(file_path)
        # Use the bare filename (no directory, no extension) so the cache
        # directory lands next to the input file.
        filename = os.path.splitext(os.path.basename(file_path))[0]
        cache_dir = os.path.join(base_dir, filename + "_cache")
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir)
        ret = self.process_one_logic(filename, file_path, cache_dir)
        shutil.rmtree(cache_dir)
        return ret
    def process(self, file_path):
        ret = self.process_one(file_path)
        if not isinstance(ret, tuple):  # an ERR_CODE_* was returned
            logging.error("{}|process failed|err_code={}".format(file_path, ret))
            return ret, -1
        gender, female_rate = ret
        logging.info("{}|gender={}|female_rate={}".format(file_path, gender, female_rate))
        return gender, female_rate
    def process_by_feature(self, feature_file):
        """
        Classify directly from a precomputed feature file.
        :param feature_file: path to a .npy file of MFCC features
        :return: (gender, female_rate)
        """
        filename = os.path.splitext(feature_file)[0]
        features = np.load(feature_file)
        gender, female_rate = self.predict(filename, features)
        return gender, female_rate
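# Sketch of a small helper (hypothetical, not part of the original tool) for
# producing the feature files that process_by_feature() consumes. It assumes
# wav_path has already been transcoded and volume-balanced, and follows the
# "*feature.npy" naming convention used by test_all_feature() below.
def dump_feature(wav_path, feature_path):
    features = get_one_mfcc(wav_path)
    if len(features) < FRAME_LEN:  # same minimum-length check as process_one_logic()
        return ERR_CODE_FEATURE_TOO_SHORT
    np.save(feature_path, features)
    return ERR_CODE_SUCCESS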
def test_all_feature():
    import glob
    base_dir = "/data/datasets/music_voice_dataset_full/feature_online_data_v3"
    female = glob.glob(os.path.join(base_dir, "female/*feature.npy"))
    male = glob.glob(os.path.join(base_dir, "male/*feature.npy"))
    other = glob.glob(os.path.join(base_dir, "other/*feature.npy"))
    model_path = "/data/jianli.yang/voice_classification/online/models"
    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
    tot_st = time.time()
    # Confusion matrix: ret_map[true_label][predicted_label]
    ret_map = {
        0: {0: 0, 1: 0, 2: 0},
        1: {0: 0, 1: 0, 2: 0},
        2: {0: 0, 1: 0, 2: 0}
    }
    for file in female:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process_by_feature(file)
        ret_map[0][gender] += 1
        if gender != 0:
            print("err:female->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    for file in male:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process_by_feature(file)
        ret_map[1][gender] += 1
        if gender != 1:
            print("err:male->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    for file in other:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process_by_feature(file)
        ret_map[2][gender] += 1
        if gender != 2:
            print("err:other->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    global transcode_time, vb_time, mfcc_time, predict_time
    print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(
        time.time() - tot_st, transcode_time, vb_time, mfcc_time, predict_time))
    f_f = ret_map[0][0]
    f_m = ret_map[0][1]
    f_o = ret_map[0][2]
    m_f = ret_map[1][0]
    m_m = ret_map[1][1]
    m_o = ret_map[1][2]
    o_f = ret_map[2][0]
    o_m = ret_map[2][1]
    o_o = ret_map[2][2]
    print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
    print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
    print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
    # Female precision and recall
    f_acc = f_f / (f_f + m_f + o_f)
    f_recall = f_f / (f_f + f_m + f_o)
    # Male precision and recall
    m_acc = m_m / (m_m + f_m + o_m)
    m_recall = m_m / (m_m + m_f + m_o)
    print("female: acc={}|recall={}".format(f_acc, f_recall))
    print("male: acc={}|recall={}".format(m_acc, m_recall))
def test_all():
    import glob
    base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200"
    female = glob.glob(os.path.join(base_dir, "female/*mp4"))
    male = glob.glob(os.path.join(base_dir, "male/*mp4"))
    other = glob.glob(os.path.join(base_dir, "other/*mp4"))
    model_path = "/data/jianli.yang/voice_classification/online/models"
    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
    tot_st = time.time()
    # Confusion matrix: ret_map[true_label][predicted_label]
    ret_map = {
        0: {0: 0, 1: 0, 2: 0},
        1: {0: 0, 1: 0, 2: 0},
        2: {0: 0, 1: 0, 2: 0}
    }
    for file in female:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process(file)
        ret_map[0][gender] += 1
        if gender != 0:
            print("err:female->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    for file in male:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process(file)
        ret_map[1][gender] += 1
        if gender != 1:
            print("err:male->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    for file in other:
        st = time.time()
        print("------------------------------>>>>>")
        gender, female_score = vc.process(file)
        ret_map[2][gender] += 1
        if gender != 2:
            print("err:other->{}|{}|{}".format(gender, file, female_score))
        print("process|spend_tm=={}".format(time.time() - st))
    global transcode_time, vb_time, mfcc_time, predict_time
    print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(
        time.time() - tot_st, transcode_time, vb_time, mfcc_time, predict_time))
    f_f = ret_map[0][0]
    f_m = ret_map[0][1]
    f_o = ret_map[0][2]
    m_f = ret_map[1][0]
    m_m = ret_map[1][1]
    m_o = ret_map[1][2]
    o_f = ret_map[2][0]
    o_m = ret_map[2][1]
    o_o = ret_map[2][2]
    print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
    print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
    print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
    # Female precision and recall
    f_acc = f_f / (f_f + m_f + o_f)
    f_recall = f_f / (f_f + f_m + f_o)
    # Male precision and recall
    m_acc = m_m / (m_m + f_m + o_m)
    m_recall = m_m / (m_m + m_f + m_o)
    print("female: acc={}|recall={}".format(f_acc, f_recall))
    print("male: acc={}|recall={}".format(m_acc, m_recall))
if __name__ == "__main__":
    # Usage: python voice_class_online.py <model_dir> <audio_file>
    # test_all()
    # test_all_feature()
    model_path = sys.argv[1]
    voice_path = sys.argv[2]
    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
    for i in range(0, 1):  # run once; increase the range to repeat for timing
        st = time.time()
        print("------------------------------>>>>>")
        vc.process(voice_path)
        print("process|spend_tm=={}".format(time.time() - st))
