Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F4880336
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
17 KB
Subscribers
None
View Options
diff --git a/AIMeiSheng/voice_classification/online/voice_class_online_fang.py b/AIMeiSheng/voice_classification/online/voice_class_online_fang.py
index a1a7d8f..d792db0 100644
--- a/AIMeiSheng/voice_classification/online/voice_class_online_fang.py
+++ b/AIMeiSheng/voice_classification/online/voice_class_online_fang.py
@@ -1,442 +1,444 @@
"""
男女声分类在线工具
1 转码为16bit单声道
2 均衡化
3 模型分类
"""
import os
import sys
import librosa
import shutil
import logging
import time
import torch.nn.functional as F
import numpy as np
from model import *
# from common import bind_kernel
logging.basicConfig(level=logging.INFO)
os.environ["LRU_CACHE_CAPACITY"] = "1"
# torch.set_num_threads(1)
# bind_kernel(1)
"""
临时用一下,全局使用的变量
"""
transcode_time = 0
vb_time = 0
mfcc_time = 0
predict_time = 0
"""
错误码
"""
ERR_CODE_SUCCESS = 0 # 处理成功
ERR_CODE_NO_FILE = -1 # 文件不存在
ERR_CODE_TRANSCODE = -2 # 转码失败
ERR_CODE_VOLUME_BALANCED = -3 # 均衡化失败
ERR_CODE_FEATURE_TOO_SHORT = -4 # 特征文件太短
"""
常量
"""
FRAME_LEN = 128
MFCC_LEN = 80
gs_bin_url = "https://av-audit-sync-sg-1256122840.cos.ap-singapore.myqcloud.com/hub/voice_classification/bin/bin.zip"
EBUR128_BIN = "/tmp/voice_class_bin/standard_audio_no_cut"
# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut"
GENDER_FEMALE = 0
GENDER_MALE = 1
GENDER_OTHER = 2
"""
通用函数
"""
def exec_cmd(cmd):
ret = os.system(cmd)
if ret != 0:
return False
return True
"""
业务需要的函数
"""
def get_one_mfcc(file_url):
st = time.time()
data, sr = librosa.load(file_url, sr=16000)
if len(data) < 512:
return []
mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN)
mfcc = mfcc.transpose()
print("get_one_mfcc:spend_time={}".format(time.time() - st))
global mfcc_time
mfcc_time += time.time() - st
return mfcc
def download_volume_balanced():
import urllib.request
if not os.path.exists(EBUR128_BIN):
dst_path = "/tmp/bin.zip"
urllib.request.urlretrieve(gs_bin_url, dst_path)
if not os.path.exists(dst_path):
print(f"download dst_path={gs_bin_url} err!")
exit(-1)
dirname = os.path.dirname(dst_path)
cmd = f"cd {dirname}; unzip bin.zip; rm -f bin.zip; mv bin voice_class_bin"
os.system(cmd)
if not os.path.exists(EBUR128_BIN):
print(f"exec {cmd} err!")
exit(-1)
def volume_balanced(src, dst):
st = time.time()
download_volume_balanced()
cmd = "{} {} {}".format(EBUR128_BIN, src, dst)
logging.info(cmd)
exec_cmd(cmd)
if not os.path.exists(dst):
logging.error("volume_balanced:cmd={}".format(cmd))
print("volume_balanced:spend_time={}".format(time.time() - st))
global vb_time
vb_time += time.time() - st
return os.path.exists(dst)
def transcode(src, dst):
st = time.time()
cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst)
logging.info(cmd)
exec_cmd(cmd)
if not os.path.exists(dst):
logging.error("transcode:cmd={}".format(cmd))
print("transcode:spend_time={}".format(time.time() - st))
global transcode_time
transcode_time += time.time() - st
return os.path.exists(dst)
class VoiceClass:
def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model):
"""
四个模型
:param music_voice_pure_model: 分辨纯净人声/其他
:param music_voice_no_pure_model: 分辨有人声/其他
:param gender_pure_model: 纯净人声分辨男女
:param gender_no_pure_model: 有人声分辨男女
"""
st = time.time()
self.device = "cpu"
self.batch_size = 256
self.music_voice_pure_model = load_model(MusicVoiceV5Model, music_voice_pure_model, self.device)
self.music_voice_no_pure_model = load_model(MusicVoiceV5Model, music_voice_no_pure_model, self.device)
self.gender_pure_model = load_model(MobileNetV2Gender, gender_pure_model, self.device)
self.gender_no_pure_model = load_model(MobileNetV2Gender, gender_no_pure_model, self.device)
logging.info("load model ok ! spend_time={}".format(time.time() - st))
def batch_predict(self, model, features):
st = time.time()
scores = []
with torch.no_grad():
for i in range(0, len(features), self.batch_size):
cur_data = features[i:i + self.batch_size].to(self.device)
predicts = model(cur_data)
predicts_score = F.softmax(predicts, dim=1)
scores.extend(predicts_score.cpu().numpy())
ret = np.array(scores)
global predict_time
predict_time += time.time() - st
return ret
def predict_pure(self, filename, features):
scores = self.batch_predict(self.music_voice_pure_model, features)
new_features = []
for idx, score in enumerate(scores):
if score[0] > 0.5: # 非人声
continue
new_features.append(features[idx].numpy())
# 人声段太少,不能进行处理
# 参数可以改
new_feature_len = len(new_features)
new_feature_rate = len(new_features) / len(features)
- if new_feature_len < 4 or new_feature_rate < 0.4:
+ # 修改人声占比低一些
+ if new_feature_len < 4 or new_feature_rate < 0.05:
logging.warning(
"filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
)
return GENDER_OTHER, -1
new_features = torch.from_numpy(np.array(new_features))
scores = self.batch_predict(self.gender_pure_model, new_features)
f_avg = sum(scores[:, 0]) / len(scores)
m_avg = sum(scores[:, 1]) / len(scores)
female_rate = f_avg / (f_avg + m_avg)
if female_rate > 0.65:
return GENDER_FEMALE, female_rate
if female_rate < 0.12:
return GENDER_MALE, female_rate
logging.warning(
"filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
)
return GENDER_OTHER, female_rate
def predict_no_pure(self, filename, features):
scores = self.batch_predict(self.music_voice_no_pure_model, features)
new_features = []
for idx, score in enumerate(scores):
if score[0] > 0.5: # 非人声
continue
new_features.append(features[idx].numpy())
# 人声段太少,不能进行处理
# 参数可以改
new_feature_len = len(new_features)
new_feature_rate = len(new_features) / len(features)
- if new_feature_len < 4 or new_feature_rate < 0.4:
+ # 修改人声占比低一些
+ if new_feature_len < 4 or new_feature_rate < 0.05:
logging.warning(
"filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
)
return GENDER_OTHER, -1
new_features = torch.from_numpy(np.array(new_features))
scores = self.batch_predict(self.gender_no_pure_model, new_features)
f_avg = sum(scores[:, 0]) / len(scores)
m_avg = sum(scores[:, 1]) / len(scores)
female_rate = f_avg / (f_avg + m_avg)
if female_rate > 0.75:
return GENDER_FEMALE, female_rate
if female_rate < 0.1:
return GENDER_MALE, female_rate
logging.warning(
"filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
)
return GENDER_OTHER, female_rate
def predict(self, filename, features):
st = time.time()
new_features = []
for i in range(FRAME_LEN, len(features), FRAME_LEN):
new_features.append(features[i - FRAME_LEN: i])
new_features = torch.from_numpy(np.array(new_features))
gender, rate = self.predict_pure(filename, new_features)
if gender == GENDER_OTHER:
logging.info("start no pure process...")
gender, rate = self.predict_no_pure(filename, new_features)
return gender, rate, False
print("predict|spend_time={}".format(time.time() - st))
return gender, rate, True
def process_one_logic(self, filename, file_path, cache_dir):
tmp_wav = os.path.join(cache_dir, "tmp.wav")
tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav")
if not transcode(file_path, tmp_wav):
return ERR_CODE_TRANSCODE, None, None
if not volume_balanced(tmp_wav, tmp_vb_wav):
return ERR_CODE_VOLUME_BALANCED, None, None
features = get_one_mfcc(tmp_vb_wav)
if len(features) < FRAME_LEN:
logging.error("feature too short|file_path={}".format(file_path))
return ERR_CODE_FEATURE_TOO_SHORT, None, None
return self.predict(filename, features)
def process_one(self, file_path):
base_dir = os.path.dirname(file_path)
filename = os.path.splitext(file_path)[0]
print("filename:", filename)
cache_dir = os.path.join(base_dir, filename + "_cache")
if os.path.exists(cache_dir):
shutil.rmtree(cache_dir)
os.makedirs(cache_dir)
ret = self.process_one_logic(filename, file_path, cache_dir)
shutil.rmtree(cache_dir)
return ret
def process(self, file_path):
gender, female_rate, is_pure = self.process_one(file_path)
logging.info("{}|gender={}|female_rate={}".format(file_path, gender, female_rate))
return gender, female_rate, is_pure
def process_by_feature(self, feature_file):
"""
直接处理特征文件
:param feature_file:
:return:
"""
filename = os.path.splitext(feature_file)[0]
features = np.load(feature_file)
gender, female_rate = self.predict(filename, features)
return gender, female_rate
def test_all_feature():
import glob
base_dir = "/data/datasets/music_voice_dataset_full/feature_online_data_v3"
female = glob.glob(os.path.join(base_dir, "female/*feature.npy"))
male = glob.glob(os.path.join(base_dir, "male/*feature.npy"))
other = glob.glob(os.path.join(base_dir, "other/*feature.npy"))
model_path = "/data/jianli.yang/voice_classification/online/models"
music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
tot_st = time.time()
ret_map = {
0: {0: 0, 1: 0, 2: 0},
1: {0: 0, 1: 0, 2: 0},
2: {0: 0, 1: 0, 2: 0}
}
for file in female:
st = time.time()
print("------------------------------>>>>>")
gender, female_score = vc.process_by_feature(file)
ret_map[0][gender] += 1
if gender != 0:
print("err:female->{}|{}|{}".format(gender, file, female_score))
print("process|spend_tm=={}".format(time.time() - st))
for file in male:
st = time.time()
print("------------------------------>>>>>")
gender, female_score = vc.process_by_feature(file)
ret_map[1][gender] += 1
if gender != 1:
print("err:male->{}|{}|{}".format(gender, file, female_score))
print("process|spend_tm=={}".format(time.time() - st))
for file in other:
st = time.time()
print("------------------------------>>>>>")
gender, female_score = vc.process_by_feature(file)
ret_map[2][gender] += 1
if gender != 2:
print("err:other->{}|{}|{}".format(gender, file, female_score))
print("process|spend_tm=={}".format(time.time() - st))
global transcode_time, vb_time, mfcc_time, predict_time
print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time,
vb_time, mfcc_time, predict_time))
f_f = ret_map[0][0]
f_m = ret_map[0][1]
f_o = ret_map[0][2]
m_f = ret_map[1][0]
m_m = ret_map[1][1]
m_o = ret_map[1][2]
o_f = ret_map[2][0]
o_m = ret_map[2][1]
o_o = ret_map[2][2]
print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
# 女性准确率和召回率
f_acc = f_f / (f_f + m_f + o_f)
f_recall = f_f / (f_f + f_m + f_o)
# 男性准确率和召回率
m_acc = m_m / (m_m + f_m + o_m)
m_recall = m_m / (m_m + m_f + m_o)
print("female: acc={}|recall={}".format(f_acc, f_recall))
print("male: acc={}|recall={}".format(m_acc, m_recall))
def test_all():
import glob
base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200"
female = glob.glob(os.path.join(base_dir, "female/*mp4"))
male = glob.glob(os.path.join(base_dir, "male/*mp4"))
other = glob.glob(os.path.join(base_dir, "other/*mp4"))
model_path = "/data/jianli.yang/voice_classification/online/models"
music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
tot_st = time.time()
ret_map = {
0: {0: 0, 1: 0, 2: 0},
1: {0: 0, 1: 0, 2: 0},
2: {0: 0, 1: 0, 2: 0}
}
for file in female:
st = time.time()
print("------------------------------>>>>>")
gender, female_score = vc.process(file)
ret_map[0][gender] += 1
if gender != 0:
print("err:female->{}|{}|{}".format(gender, file, female_score))
print("process|spend_tm=={}".format(time.time() - st))
for file in male:
st = time.time()
print("------------------------------>>>>>")
gender, female_score = vc.process(file)
ret_map[1][gender] += 1
if gender != 1:
print("err:male->{}|{}|{}".format(gender, file, female_score))
print("process|spend_tm=={}".format(time.time() - st))
for file in other:
st = time.time()
print("------------------------------>>>>>")
gender, female_score = vc.process(file)
ret_map[2][gender] += 1
if gender != 2:
print("err:other->{}|{}|{}".format(gender, file, female_score))
print("process|spend_tm=={}".format(time.time() - st))
global transcode_time, vb_time, mfcc_time, predict_time
print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time,
vb_time, mfcc_time, predict_time))
f_f = ret_map[0][0]
f_m = ret_map[0][1]
f_o = ret_map[0][2]
m_f = ret_map[1][0]
m_m = ret_map[1][1]
m_o = ret_map[1][2]
o_f = ret_map[2][0]
o_m = ret_map[2][1]
o_o = ret_map[2][2]
print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
# 女性准确率和召回率
f_acc = f_f / (f_f + m_f + o_f)
f_recall = f_f / (f_f + f_m + f_o)
# 男性准确率和召回率
m_acc = m_m / (m_m + f_m + o_m)
m_recall = m_m / (m_m + m_f + m_o)
print("female: acc={}|recall={}".format(f_acc, f_recall))
print("male: acc={}|recall={}".format(m_acc, m_recall))
if __name__ == "__main__":
# test_all()
# test_all_feature()
model_path = sys.argv[1]
voice_path = sys.argv[2]
music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
for i in range(0, 1):
st = time.time()
print("------------------------------>>>>>")
gender, female_rate, is_pure = vc.process(voice_path)
print("process|spend_tm=={}".format(time.time() - st))
print("gender:{}, female_rate:{},is_pure:{}".format(gender, female_rate, is_pure))
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:34 (1 d, 11 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347133
Default Alt Text
(17 KB)
Attached To
R350 av_svc
Event Timeline
Log In to Comment