Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F4845551
voice_class_online.py
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
15 KB
Subscribers
None
voice_class_online.py
View Options
"""
男女声分类在线工具
1 转码为16bit单声道
2 均衡化
3 模型分类
"""
import logging
import os
import shutil
import sys
import time

import librosa
import numpy as np
import torch
import torch.nn.functional as F

from model import *
# from common import bind_kernel
logging
.
basicConfig
(
level
=
logging
.
INFO
)
os
.
environ
[
"LRU_CACHE_CAPACITY"
]
=
"1"
# torch.set_num_threads(1)
# bind_kernel(1)
"""
临时用一下,全局使用的变量
"""
transcode_time
=
0
vb_time
=
0
mfcc_time
=
0
predict_time
=
0
"""
错误码
"""
ERR_CODE_SUCCESS
=
0
# 处理成功
ERR_CODE_NO_FILE
=
-
1
# 文件不存在
ERR_CODE_TRANSCODE
=
-
2
# 转码失败
ERR_CODE_VOLUME_BALANCED
=
-
3
# 均衡化失败
ERR_CODE_FEATURE_TOO_SHORT
=
-
4
# 特征文件太短
"""
常量
"""
FRAME_LEN
=
128
MFCC_LEN
=
80
EBUR128_BIN
=
"/opt/soft/bin/standard_audio_no_cut"
# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut"
GENDER_FEMALE
=
0
GENDER_MALE
=
1
GENDER_OTHER
=
2
"""
通用函数
"""
def exec_cmd(cmd):
    """Run *cmd* through the shell; return True iff it exited with status 0."""
    return os.system(cmd) == 0
"""
业务需要的函数
"""
def get_one_mfcc(file_url):
    """
    Load an audio file and compute its MFCC feature matrix.

    :param file_url: path to any audio file readable by librosa.
    :return: ndarray of shape (frames, MFCC_LEN), or [] when the audio is
             shorter than one FFT window (< 512 samples at 16 kHz).
    """
    st = time.time()
    # Resample to 16 kHz mono to match the models' training data.
    data, sr = librosa.load(file_url, sr=16000)
    if len(data) < 512:
        # Not enough samples for a single n_fft window.
        return []
    mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256,
                                n_mfcc=MFCC_LEN)
    # (n_mfcc, frames) -> (frames, n_mfcc)
    mfcc = mfcc.transpose()
    # Measure once so the logged and the accumulated durations agree
    # (the original sampled time.time() twice); log instead of print
    # for consistency with the rest of the module.
    elapsed = time.time() - st
    logging.info("get_one_mfcc:spend_time={}".format(elapsed))
    global mfcc_time
    mfcc_time += elapsed
    return mfcc
def volume_balanced(src, dst):
    """
    Loudness-normalize *src* into *dst* via the external EBU R128 binary.

    :param src: input wav path.
    :param dst: output wav path.
    :return: True when *dst* exists afterwards, else False.
    """
    st = time.time()
    # NOTE(review): paths are interpolated into a shell command unquoted;
    # paths containing spaces or shell metacharacters will misbehave.
    cmd = "{} {} {}".format(EBUR128_BIN, src, dst)
    logging.info(cmd)
    exec_cmd(cmd)
    # Success is judged solely by the output file existing (the binary's
    # exit status is not checked here).
    ok = os.path.exists(dst)
    if not ok:
        logging.error("volume_balanced:cmd={}".format(cmd))
    # Single time sample so the logged and accumulated durations agree.
    elapsed = time.time() - st
    logging.info("volume_balanced:spend_time={}".format(elapsed))
    global vb_time
    vb_time += elapsed
    return ok
def transcode(src, dst):
    """
    Transcode *src* to 16 kHz mono 16-bit wav at *dst* using ffmpeg.

    :param src: input media path (any format ffmpeg can read).
    :param dst: output wav path (must not already exist: no -y flag).
    :return: True when *dst* exists afterwards, else False.
    """
    st = time.time()
    # NOTE(review): paths are interpolated into a shell command unquoted;
    # paths containing spaces or shell metacharacters will misbehave.
    cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst)
    logging.info(cmd)
    exec_cmd(cmd)
    # Success is judged solely by the output file existing.
    ok = os.path.exists(dst)
    if not ok:
        logging.error("transcode:cmd={}".format(cmd))
    # Single time sample so the logged and accumulated durations agree.
    elapsed = time.time() - st
    logging.info("transcode:spend_time={}".format(elapsed))
    global transcode_time
    transcode_time += elapsed
    return ok
class VoiceClass:
    """Male/female voice classifier built from four torch model checkpoints."""

    def __init__(self, music_voice_pure_model, music_voice_no_pure_model,
                 gender_pure_model, gender_no_pure_model):
        """
        Load the four model checkpoints onto the CPU.

        :param music_voice_pure_model: path; discriminates pure vocals vs. other.
        :param music_voice_no_pure_model: path; discriminates any vocals vs. other.
        :param gender_pure_model: path; gender classifier for pure vocals.
        :param gender_no_pure_model: path; gender classifier for non-pure vocals.
        """
        st = time.time()
        self.device = "cpu"
        self.batch_size = 256
        self.music_voice_pure_model = load_model(
            MusicVoiceV5Model, music_voice_pure_model, self.device)
        self.music_voice_no_pure_model = load_model(
            MusicVoiceV5Model, music_voice_no_pure_model, self.device)
        self.gender_pure_model = load_model(
            MobileNetV2Gender, gender_pure_model, self.device)
        self.gender_no_pure_model = load_model(
            MobileNetV2Gender, gender_no_pure_model, self.device)
        logging.info("load model ok ! spend_time={}".format(time.time() - st))

    def batch_predict(self, model, features):
        """
        Run *model* over *features* in batches and return softmax scores.

        :param model: torch module mapping a feature batch to class logits.
        :param features: indexable tensor of feature windows.
        :return: np.ndarray of per-window softmax scores, one row per window.
        """
        st = time.time()
        scores = []
        with torch.no_grad():
            for i in range(0, len(features), self.batch_size):
                cur_data = features[i:i + self.batch_size].to(self.device)
                predicts = model(cur_data)
                predicts_score = F.softmax(predicts, dim=1)
                scores.extend(predicts_score.cpu().numpy())
        ret = np.array(scores)
        global predict_time
        predict_time += time.time() - st
        return ret

    def _classify_gender(self, filename, features, voice_model, gender_model,
                         female_th, male_th, tag):
        """
        Shared core of predict_pure/predict_no_pure (previously two
        copy-pasted bodies differing only in models, thresholds and log tag):
        keep the windows *voice_model* scores as vocal, then average
        *gender_model* scores over them.

        :param female_th: female_rate above this -> GENDER_FEMALE.
        :param male_th: female_rate below this -> GENDER_MALE.
        :param tag: log label ("predict_pure" / "predict_no_pure").
        :return: (gender, female_rate); female_rate is -1 when there are too
                 few vocal windows to decide.
        """
        scores = self.batch_predict(voice_model, features)
        voiced = []
        for idx, score in enumerate(scores):
            if score[0] > 0.5:
                # Scored as non-vocal; drop this window.
                continue
            voiced.append(features[idx].numpy())
        # Too few vocal windows (absolute or relative) -> refuse to classify.
        # Both thresholds are tunable.
        new_feature_len = len(voiced)
        new_feature_rate = new_feature_len / len(features)
        if new_feature_len < 4 or new_feature_rate < 0.4:
            logging.warning("filename={}|{}|other|len={}|rate={}".format(
                filename, tag, new_feature_len, new_feature_rate))
            return GENDER_OTHER, -1
        voiced = torch.from_numpy(np.array(voiced))
        scores = self.batch_predict(gender_model, voiced)
        # Column 0 is the female score, column 1 the male score.
        f_avg = sum(scores[:, 0]) / len(scores)
        m_avg = sum(scores[:, 1]) / len(scores)
        female_rate = f_avg / (f_avg + m_avg)
        if female_rate > female_th:
            return GENDER_FEMALE, female_rate
        if female_rate < male_th:
            return GENDER_MALE, female_rate
        logging.warning("filename={}|{}|other|len={}|rate={}".format(
            filename, tag, new_feature_len, new_feature_rate))
        return GENDER_OTHER, female_rate

    def predict_pure(self, filename, features):
        """Classify gender assuming pure vocals (thresholds 0.65 / 0.12)."""
        return self._classify_gender(
            filename, features, self.music_voice_pure_model,
            self.gender_pure_model, 0.65, 0.12, "predict_pure")

    def predict_no_pure(self, filename, features):
        """Classify gender for non-pure vocals (stricter thresholds 0.75 / 0.1)."""
        return self._classify_gender(
            filename, features, self.music_voice_no_pure_model,
            self.gender_no_pure_model, 0.75, 0.1, "predict_no_pure")

    def predict(self, filename, features):
        """
        Slice *features* into FRAME_LEN-frame windows and classify gender,
        trying the pure-vocal pipeline first and falling back to the
        non-pure pipeline when it answers GENDER_OTHER.

        :param features: 2-D array of MFCC frames, shape (frames, MFCC_LEN).
        :return: (gender, female_rate)
        """
        st = time.time()
        windows = []
        for i in range(FRAME_LEN, len(features), FRAME_LEN):
            windows.append(features[i - FRAME_LEN:i])
        if not windows:
            # Exactly FRAME_LEN frames yields zero windows (the upstream guard
            # only rejects len < FRAME_LEN, and process_by_feature has no
            # guard at all); previously this crashed with ZeroDivisionError.
            logging.warning(
                "filename={}|predict|too short for one window".format(filename))
            return GENDER_OTHER, -1
        windows = torch.from_numpy(np.array(windows))
        gender, rate = self.predict_pure(filename, windows)
        if gender == GENDER_OTHER:
            logging.info("start no pure process...")
            return self.predict_no_pure(filename, windows)
        # Log instead of print, consistent with the rest of the module.
        logging.info("predict|spend_time={}".format(time.time() - st))
        return gender, rate

    def process_one_logic(self, filename, file_path, cache_dir):
        """
        Transcode -> loudness-balance -> MFCC -> predict.

        :return: (gender, female_rate) on success, or a negative
                 ERR_CODE_* int on pipeline failure.
        """
        tmp_wav = os.path.join(cache_dir, "tmp.wav")
        tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav")
        if not transcode(file_path, tmp_wav):
            return ERR_CODE_TRANSCODE
        if not volume_balanced(tmp_wav, tmp_vb_wav):
            return ERR_CODE_VOLUME_BALANCED
        features = get_one_mfcc(tmp_vb_wav)
        if len(features) < FRAME_LEN:
            logging.error("feature too short|file_path={}".format(file_path))
            return ERR_CODE_FEATURE_TOO_SHORT
        return self.predict(filename, features)

    def process_one(self, file_path):
        """Run the full pipeline for one file inside a throwaway cache dir."""
        base_dir = os.path.dirname(file_path)
        # NOTE: splitext keeps the directory part, so `filename` is the full
        # path minus extension; os.path.join below therefore ignores base_dir
        # and the cache dir lands next to the input file. Kept as-is.
        filename = os.path.splitext(file_path)[0]
        cache_dir = os.path.join(base_dir, filename + "_cache")
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir)
        try:
            ret = self.process_one_logic(filename, file_path, cache_dir)
        finally:
            # Clean up even when a pipeline stage raises (previously the
            # cache dir leaked on any exception).
            shutil.rmtree(cache_dir)
        return ret

    def process(self, file_path):
        """
        Classify one audio file.

        :return: (gender, female_rate). On pipeline failure, gender is
                 GENDER_OTHER and female_rate carries the negative ERR_CODE_*.
        """
        ret = self.process_one(file_path)
        if not isinstance(ret, tuple):
            # process_one_logic returned a bare error code; the original
            # crashed here (TypeError) trying to unpack an int into two names.
            logging.error("{}|process failed|err_code={}".format(file_path, ret))
            return GENDER_OTHER, ret
        gender, female_rate = ret
        logging.info("{}|gender={}|female_rate={}".format(
            file_path, gender, female_rate))
        return gender, female_rate

    def process_by_feature(self, feature_file):
        """
        Classify directly from a precomputed .npy feature file.

        :param feature_file: path to a (frames, MFCC_LEN) .npy array.
        :return: (gender, female_rate)
        """
        filename = os.path.splitext(feature_file)[0]
        features = np.load(feature_file)
        gender, female_rate = self.predict(filename, features)
        return gender, female_rate
def test_all_feature():
    """
    Evaluate the classifier on precomputed feature (.npy) files and print
    a confusion matrix plus precision/recall for the female and male classes.
    """
    import glob
    base_dir = "/data/datasets/music_voice_dataset_full/feature_online_data_v3"
    female = glob.glob(os.path.join(base_dir, "female/*feature.npy"))
    male = glob.glob(os.path.join(base_dir, "male/*feature.npy"))
    other = glob.glob(os.path.join(base_dir, "other/*feature.npy"))
    model_path = "/data/jianli.yang/voice_classification/online/models"
    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model,
                    gender_pure_model, gender_no_pure_model)
    tot_st = time.time()
    # Confusion counts: ret_map[true_label][predicted_label].
    ret_map = {
        0: {0: 0, 1: 0, 2: 0},
        1: {0: 0, 1: 0, 2: 0},
        2: {0: 0, 1: 0, 2: 0},
    }
    label_names = {GENDER_FEMALE: "female", GENDER_MALE: "male",
                   GENDER_OTHER: "other"}

    def run_group(files, expected):
        # Classify every file, tally into ret_map, print mismatches.
        # (Previously this loop body was copy-pasted three times.)
        for file in files:
            st = time.time()
            print("------------------------------>>>>>")
            gender, female_score = vc.process_by_feature(file)
            ret_map[expected][gender] += 1
            if gender != expected:
                print("err:{}->{}|{}|{}".format(
                    label_names[expected], gender, file, female_score))
            print("process|spend_tm=={}".format(time.time() - st))

    run_group(female, GENDER_FEMALE)
    run_group(male, GENDER_MALE)
    run_group(other, GENDER_OTHER)
    print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(
        time.time() - tot_st, transcode_time, vb_time, mfcc_time, predict_time))
    f_f, f_m, f_o = ret_map[0][0], ret_map[0][1], ret_map[0][2]
    m_f, m_m, m_o = ret_map[1][0], ret_map[1][1], ret_map[1][2]
    o_f, o_m, o_o = ret_map[2][0], ret_map[2][1], ret_map[2][2]
    print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
    print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
    print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
    # Precision and recall for the female class.
    f_acc = f_f / (f_f + m_f + o_f)
    f_recall = f_f / (f_f + f_m + f_o)
    # Precision and recall for the male class.
    m_acc = m_m / (m_m + f_m + o_m)
    m_recall = m_m / (m_m + m_f + m_o)
    print("female: acc={}|recall={}".format(f_acc, f_recall))
    print("male: acc={}|recall={}".format(m_acc, m_recall))
def test_all():
    """
    Evaluate the classifier end-to-end on raw mp4 files and print a
    confusion matrix plus precision/recall for the female and male classes.
    """
    import glob
    base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200"
    female = glob.glob(os.path.join(base_dir, "female/*mp4"))
    male = glob.glob(os.path.join(base_dir, "male/*mp4"))
    other = glob.glob(os.path.join(base_dir, "other/*mp4"))
    model_path = "/data/jianli.yang/voice_classification/online/models"
    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model,
                    gender_pure_model, gender_no_pure_model)
    tot_st = time.time()
    # Confusion counts: ret_map[true_label][predicted_label].
    ret_map = {
        0: {0: 0, 1: 0, 2: 0},
        1: {0: 0, 1: 0, 2: 0},
        2: {0: 0, 1: 0, 2: 0},
    }
    label_names = {GENDER_FEMALE: "female", GENDER_MALE: "male",
                   GENDER_OTHER: "other"}

    def run_group(files, expected):
        # Classify every file, tally into ret_map, print mismatches.
        # (Previously this loop body was copy-pasted three times.)
        for file in files:
            st = time.time()
            print("------------------------------>>>>>")
            gender, female_score = vc.process(file)
            ret_map[expected][gender] += 1
            if gender != expected:
                print("err:{}->{}|{}|{}".format(
                    label_names[expected], gender, file, female_score))
            print("process|spend_tm=={}".format(time.time() - st))

    run_group(female, GENDER_FEMALE)
    run_group(male, GENDER_MALE)
    run_group(other, GENDER_OTHER)
    print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(
        time.time() - tot_st, transcode_time, vb_time, mfcc_time, predict_time))
    f_f, f_m, f_o = ret_map[0][0], ret_map[0][1], ret_map[0][2]
    m_f, m_m, m_o = ret_map[1][0], ret_map[1][1], ret_map[1][2]
    o_f, o_m, o_o = ret_map[2][0], ret_map[2][1], ret_map[2][2]
    print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
    print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
    print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
    # Precision and recall for the female class.
    f_acc = f_f / (f_f + m_f + o_f)
    f_recall = f_f / (f_f + f_m + f_o)
    # Precision and recall for the male class.
    m_acc = m_m / (m_m + f_m + o_m)
    m_recall = m_m / (m_m + m_f + m_o)
    print("female: acc={}|recall={}".format(f_acc, f_recall))
    print("male: acc={}|recall={}".format(m_acc, m_recall))
if __name__ == "__main__":
    # Batch evaluation entry points (run manually when needed):
    # test_all(), test_all_feature()
    if len(sys.argv) < 3:
        # Fail with a usage message instead of an IndexError.
        print("usage: python voice_class_online.py <model_dir> <voice_path>")
        sys.exit(1)
    model_path = sys.argv[1]
    voice_path = sys.argv[2]
    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model,
                    gender_pure_model, gender_no_pure_model)
    # The original wrapped this in a dead single-iteration benchmark loop
    # (for i in range(0, 1)); removed.
    st = time.time()
    print("------------------------------>>>>>")
    vc.process(voice_path)
    print("process|spend_tm=={}".format(time.time() - st))
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Nov 24, 19:32 (22 h, 12 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1326487
Default Alt Text
voice_class_online.py (15 KB)
Attached To
R350 av_svc
Event Timeline
Log In to Comment