Page MenuHomePhabricator

No OneTemporary

diff --git a/AutoCoverTool/online/tone_shift_one.py b/AutoCoverTool/online/tone_shift_one.py
index e395c8d..9a422f8 100644
--- a/AutoCoverTool/online/tone_shift_one.py
+++ b/AutoCoverTool/online/tone_shift_one.py
@@ -1,328 +1,338 @@
"""
变调的方式做处理
1. 下载
2. 分离
3. 针对于人声变调+2,伴奏+1
4. 合成
"""
import os
import json
import shutil
import librosa
import logging
import numpy as np
from ref.music_remover.separate_interface import SeparateInterface
from online.inference_worker import upload_file2cos, gs_state_use, gs_state_finish, gs_state_default
from online.common import *
+from ref.online.voice_class_online import VoiceClass
logging.basicConfig(filename='/tmp/tone_shift_one.log', level=logging.INFO)
gs_tone_shift_exe = "/opt/soft/bin/tone_shift_exe"
gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"
gs_err_code_success = 0
gs_err_code_tone_shift = 1
gs_err_code_mix = 2
gs_err_code_transcode = 3
gs_err_code_upload = 4
gs_err_code_download = 5
gs_err_code_trans_to_mp3 = 6
gs_err_code_separate = 7
gs_err_code_duration_too_long = 8
gs_err_code_duration_no_vocal = 9
gs_err_code_duration_err = 10
gs_err_code_transcode_acc = 11
gs_err_code_upload_acc = 12
gs_err_code_download_acc = 13
gs_err_code_download_vocal = 14
gs_err_code_transcode_acc_v1 = 15
gs_err_code_transcode_vocal_v1 = 16
gs_err_code_silence_no_data = 17
gs_err_code_silence_no_process = 18
def exec_cmd(cmd):
r = os.popen(cmd)
text = r.read()
r.close()
return text
def get_d(audio_path):
cmd = "ffprobe -v quiet -print_format json -show_format -show_streams {}".format(audio_path)
data = exec_cmd(cmd)
data = json.loads(data)
# 返回秒
if 'format' in data.keys() and 'duration' in data['format']:
return float(data["format"]["duration"])
return -1
def get_mean_power(audio_path):
sr = 44100
audio, sr = librosa.load(audio_path, sr=sr, mono=True)
mm = np.mean(np.abs(audio))
return mm
class ToneShift:
def __init__(self):
self.separate_inst = SeparateInterface()
+ model_path = "./models"
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+
+ self.voice_class = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model,
+ gender_no_pure_model)
def update_state(self, song_id, state):
sql = "update svc_queue_table set state={},update_time={} where song_id = {}". \
format(state, int(time.time()), song_id)
banned_user_map['db'] = "av_db"
update_db(sql, banned_user_map)
def get_url_by_id(self, song_id):
sql = "select song_id, url from svc_queue_table where song_id={}".format(song_id)
banned_user_map["db"] = "av_db"
data = get_data_by_mysql(sql)
if len(data) == 0:
return None, None
return str(data[0][0]), data[0][1]
def get_one_data_logic(self):
"""
按照5,4,3的优先级进行获取
:return:
"""
song_src_arr = [5, 4, 3]
for song_src in song_src_arr:
song_id, song_url = self.get_one_data(song_src=song_src)
if song_id is not None:
return song_id, song_url
return None, None
def get_one_data(self, song_src=3):
sql = "select song_id, url from svc_queue_table where state = 0 and song_src={} order by create_time asc limit 1".format(
song_src)
banned_user_map["db"] = "av_db"
data = get_data_by_mysql(sql, banned_user_map)
if len(data) == 0:
return None, None
song_id, song_url = data[0]
if song_id != "":
self.update_state(song_id, gs_state_use)
return str(song_id), song_url
def pre_process(self, work_dir, song_url):
"""
创建文件夹,下载数据
:return:
"""
if "?sign=" in song_url:
return gs_err_code_download
ext = str(song_url).split(".")[-1]
dst_file = "{}/src_origin.{}".format(work_dir, ext)
cmd = "wget {} -O {}".format(song_url, dst_file)
os.system(cmd)
if not os.path.exists(dst_file):
return gs_err_code_download
duration = get_d(dst_file)
if duration < 0:
return gs_err_code_duration_err
print("Duration:", dst_file, duration)
if duration > 20 * 60:
return gs_err_code_duration_too_long
dst_mp3_file = "{}/src.wav".format(work_dir)
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} ".format(dst_file, dst_mp3_file)
os.system(cmd)
if not os.path.exists(dst_mp3_file):
return gs_err_code_trans_to_mp3
return gs_err_code_success
def tone_shift_one(self, in_file, dst_file, pitch):
cmd = "{} {} {} {}".format(gs_tone_shift_exe, in_file, dst_file, pitch)
os.system(cmd)
return os.path.exists(dst_file)
def mix(self, cid, vocal_path, acc_path, tp):
if tp == 1:
vocal_pitch = 2
acc_pitch = 0
else:
vocal_pitch = -2
acc_pitch = 0
vocal_path_2 = vocal_path.replace(".wav", "_{}.wav".format(vocal_pitch))
acc_path_2 = acc_path.replace(".wav", "_{}.wav".format(acc_pitch))
err = self.tone_shift_one(vocal_path, vocal_path_2, vocal_pitch)
if not err:
- return gs_err_code_tone_shift, None
+ return gs_err_code_tone_shift, None, None
+ gender, female_rate = self.voice_class.process_one(vocal_path_2)
err = self.tone_shift_one(acc_path, acc_path_2, acc_pitch)
if not err:
- return gs_err_code_tone_shift, None
+ return gs_err_code_tone_shift, None, None
base_dir = os.path.dirname(vocal_path)
mix_path = "{}/mix_{}_{}.wav".format(base_dir, vocal_pitch, acc_pitch)
cmd = "{} {} {} {}".format(gs_simple_mixer_path, vocal_path_2, acc_path_2, mix_path)
print("exec_cmd={}".format(cmd))
os.system(cmd)
if not os.path.exists(mix_path):
- return gs_err_code_mix, None
+ return gs_err_code_mix, None, None
# 转码
mix_path_mp3 = mix_path.replace(".wav", ".mp4")
cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(mix_path, mix_path_mp3)
os.system(cmd)
if not os.path.exists(mix_path_mp3):
- return gs_err_code_transcode, None
+ return gs_err_code_transcode, None, None
# 上传到cos
mix_name = os.path.basename(mix_path_mp3)
key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name)
if not upload_file2cos(key, mix_path_mp3):
- return gs_err_code_upload, None
- return gs_err_code_success, key
+ return gs_err_code_upload, None, None
+ return gs_err_code_success, key, gender
def upload_acc(self, cid, acc_path):
# 转码
mix_path_aac = acc_path.replace(".wav", ".m4a")
cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(acc_path, mix_path_aac)
os.system(cmd)
if not os.path.exists(mix_path_aac):
return gs_err_code_transcode_acc, None
# 上传
mix_name = os.path.basename(mix_path_aac)
key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name)
if not upload_file2cos(key, mix_path_aac):
return gs_err_code_upload_acc, None
return gs_err_code_success, key
def process_one(self, cid, work_dir):
"""
:param cid:
:param work_dir:
:return:
"""
src_mp3 = os.path.join(work_dir, "src.wav")
vocal_path = os.path.join(work_dir, "vocal.wav")
acc_path = os.path.join(work_dir, "acc.wav")
if not (os.path.exists(vocal_path) and os.path.exists(acc_path)):
if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path):
return gs_err_code_separate, []
if not os.path.exists(vocal_path) or not os.path.exists(acc_path):
return gs_err_code_separate, []
# 当人声的平均能量小于一定值时,则认为无人声(0.01是经验值判定,样本分析来看)
# 无人声的样本[0.0056, 0.0003], 有人声的样本(目前最小)[0.046, 0.049]
print("power:{},{}".format(cid, get_mean_power(vocal_path)))
if get_mean_power(vocal_path) < 0.02:
return gs_err_code_duration_no_vocal, []
- err, type1_mix_mp3 = self.mix(cid, vocal_path, acc_path, 1)
+ err, type1_mix_mp3, gender = self.mix(cid, vocal_path, acc_path, 1)
if err != gs_err_code_success:
return err, []
- err, type2_mix_mp3 = self.mix(cid, vocal_path, acc_path, 2)
+ err, type2_mix_mp3, gender2 = self.mix(cid, vocal_path, acc_path, 2)
if err != gs_err_code_success:
return err, []
# 上传伴奏文件
# err, acc_path_m4a = self.upload_acc(cid, acc_path)
# if err != gs_err_code_success:
# return err, []
- return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3]
+ return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3, str(gender), str(gender2)]
def download_and_transcode(self, url, local_path, local_path_wav):
cmd = "wget {} -O {}".format(url, local_path)
os.system(cmd)
if not os.path.exists(local_path):
return -1
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(local_path, local_path_wav)
os.system(cmd)
if not os.path.exists(local_path_wav):
return -2
return 0
def get_data_from_mysql(self, cid, work_dir):
sql = "select starmaker_songid,task_url,complete_url,voice_url from starmaker_musicbook.silence where starmaker_songid={} order by task_id desc limit 1".format(
cid)
data = get_data_by_mysql(sql, banned_user_map)
if len(data) == 0:
return gs_err_code_silence_no_data
song_id, task_url, complete_url, voice_url = data[0]
if complete_url != "" and voice_url != "":
"""
将人声与伴奏下载下来
"""
ext = str(complete_url).split(".")[-1]
acc_dst_file = os.path.join(work_dir, "acc.{}".format(ext))
acc_wav_dst_file = os.path.join(work_dir, "acc.wav")
err = self.download_and_transcode(complete_url, acc_dst_file, acc_wav_dst_file)
os.unlink(acc_dst_file)
if err == -1:
return gs_err_code_download_acc
if err == -2:
return gs_err_code_transcode_acc_v1
ext = str(voice_url).split(".")[-1]
vocal_dst_file = os.path.join(work_dir, "vocal.{}".format(ext))
vocal_wav_dst_file = os.path.join(work_dir, "vocal.wav")
err = self.download_and_transcode(voice_url, vocal_dst_file, vocal_wav_dst_file)
os.unlink(vocal_dst_file)
if err == -1:
return gs_err_code_download_vocal
if err == -2:
return gs_err_code_transcode_vocal_v1
return gs_err_code_success
return gs_err_code_silence_no_process
def process_worker(self):
logging.info("start process_worker .....")
base_dir = "/tmp/tone_shift_one"
if not os.path.exists(base_dir):
os.makedirs(base_dir)
while True:
worker_st = time.time()
cid, song_url = self.get_one_data_logic()
- # cid, song_url = self.get_url_by_id('175210503076374799')
+ # cid, song_url = self.get_url_by_id('611752105030548048')
if cid is None:
time.sleep(5)
logging.info("get one data is None ...")
continue
work_dir = os.path.join(base_dir, str(cid))
if os.path.exists(work_dir):
shutil.rmtree(work_dir)
os.makedirs(work_dir)
# 先查看消音数据库中是否已经完成了该项目,已经有的话,就直接下载即可
err = self.get_data_from_mysql(cid, work_dir)
if err != gs_err_code_success:
# 清空磁盘
shutil.rmtree(work_dir)
os.makedirs(work_dir)
err = self.pre_process(work_dir, song_url)
if err != gs_err_code_success:
self.update_state(str(cid), -err)
continue
st = time.time()
err, data = self.process_one(str(cid), work_dir)
logging.info("process_finish,{},{}".format(cid, time.time() - st))
if err == gs_err_code_success and len(data) != 0:
sql = "update svc_queue_table set state={},update_time={},svc_url=\"{}\" where song_id = {}". \
format(gs_state_finish, int(time.time()), ",".join(data), str(cid))
banned_user_map['db'] = "av_db"
update_db(sql, banned_user_map)
else:
self.update_state(str(cid), -err)
shutil.rmtree(work_dir)
logging.info("process_finish,{},{}".format(cid, time.time() - worker_st))
if __name__ == '__main__':
ts = ToneShift()
ts.process_worker()
diff --git a/AutoCoverTool/ref/online/common.py b/AutoCoverTool/ref/online/common.py
new file mode 100644
index 0000000..af3487a
--- /dev/null
+++ b/AutoCoverTool/ref/online/common.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+"""
+程序绑定核心
+一个脚本启动多次,每次绑定一个核心,不会多次绑定到同一个核心
+每个进程选定绑定n个核心,或者自己传入需要绑定的核心编号
+"""
+
+import time
+import psutil
+import os
+import sys
+import hashlib
+import fcntl
+
+"""
+自动获取可用核心
+"""
+
+
+def exec_cmd_ints(cmd):
+ """
+ 执行cmd,获取返回值
+ :param cmd:
+ :return:
+ """
+ r = os.popen(cmd)
+ lines = r.readlines()
+ ids = []
+ for line in lines:
+ line = line.strip()
+ if line.isdigit():
+ id = int(float(line))
+ ids.append(id)
+ return ids
+
+
+def get_idle_kernel(n=1):
+ cur_id = os.getpid()
+ name = os.path.basename(sys.argv[0])
+ command = "ps -ef | grep {} |grep python | awk \'{{print $2}}\'".format(name)
+ print(command)
+ ids = exec_cmd_ints(command)
+
+ print(ids, cur_id)
+ # 获取所有被绑定的核心
+ count = psutil.cpu_count()
+ used = [False] * (count // n)
+ command = "pidstat | grep {} | awk \'{{print $(NF-1)}}\'"
+ for i in range(0, len(ids)):
+ if cur_id != ids[i]:
+ cmd = command.format(ids[i])
+ kers = exec_cmd_ints(cmd)
+ for ker in kers:
+ ker = ker // n
+ used[ker] = True
+ print(used)
+ # 获取N个可用的核心
+ for i in range(0, len(used)):
+ if not used[i]:
+ res = []
+ cur_i = i * n
+ for idx in range(cur_i, cur_i+n):
+ if idx < count:
+ res.append(idx)
+ return res
+ return 0
+
+
+def bind_kernel(n=1, kernel=[]):
+ p = psutil.Process()
+
+ # 加锁
+ name = hashlib.md5(os.path.basename(sys.argv[0]).encode('utf-8')).hexdigest()
+ name = os.path.join("/tmp", name + ".lock")
+ if not os.path.exists(name):
+ with open(name, "w") as f:
+ f.write("0")
+ file = open(name)
+ fcntl.flock(file.fileno(), fcntl.LOCK_EX) # 排他锁
+ print("lock file --- {}".format(name))
+ if len(kernel) > 0:
+ kernels = kernel
+ else:
+ kernels = get_idle_kernel(n)
+ p.cpu_affinity(kernels) # 绑定特定核心
+ print("bind_kernel", kernels)
+ file.close() # 释放锁
+ print("unlock file --- {}".format(name))
+
+
+def calc_forever():
+    """
+    Block the caller by sleeping in a loop.
+
+    Despite the name this is finite: 10000 iterations of a 1000-second sleep.
+    """
+    for i in range(0, 10000):
+        time.sleep(1000)
\ No newline at end of file
diff --git a/AutoCoverTool/ref/online/mobilenet_v2_custom.py b/AutoCoverTool/ref/online/mobilenet_v2_custom.py
new file mode 100644
index 0000000..57b1227
--- /dev/null
+++ b/AutoCoverTool/ref/online/mobilenet_v2_custom.py
@@ -0,0 +1,142 @@
+"""
+直接从代码库中拷贝出的代码
+目的: mobilenet_v2只允许输入图片的通道数为3,不满足要求,因此拷贝出来做修改
+"""
+
+from torch import nn
+
+
+def _make_divisible(v, divisor, min_value=None):
+    """
+    Round ``v`` to the nearest multiple of ``divisor`` (at least ``min_value``).
+
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+
+    :param v: requested channel count (may be fractional after width scaling)
+    :param divisor: the result is a multiple of this value
+    :param min_value: lower bound for the result; defaults to ``divisor``
+    :return: the adjusted channel count
+    """
+    if min_value is None:
+        min_value = divisor
+    # Adding divisor / 2 before the floor division rounds to nearest.
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class ConvBNReLU(nn.Sequential):
+    """Bias-free Conv2d followed by BatchNorm2d and ReLU6, as one sequential unit."""
+
+    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+        """
+        :param in_planes: number of input channels of the convolution
+        :param out_planes: number of output channels
+        :param kernel_size: square kernel size
+        :param stride: convolution stride
+        :param groups: convolution groups (set to the channel count for depthwise)
+        """
+        # Padding derived from the kernel size: 3 -> 1, 1 -> 0.
+        padding = (kernel_size - 1) // 2
+        super(ConvBNReLU, self).__init__(
+            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+            nn.BatchNorm2d(out_planes),
+            nn.ReLU6(inplace=True)
+        )
+
+
+class InvertedResidual(nn.Module):
+    """
+    MobileNetV2 inverted-residual block.
+
+    Optional 1x1 expansion, 3x3 depthwise convolution, then a linear 1x1
+    projection. The input is added back to the output only when the stride is
+    1 and the input and output channel counts match.
+    """
+
+    def __init__(self, inp, oup, stride, expand_ratio):
+        """
+        :param inp: number of input channels
+        :param oup: number of output channels
+        :param stride: stride of the depthwise convolution, 1 or 2
+        :param expand_ratio: multiplier applied to ``inp`` for the hidden width
+        """
+        super(InvertedResidual, self).__init__()
+        self.stride = stride
+        assert stride in [1, 2]
+
+        hidden_dim = int(round(inp * expand_ratio))
+        # The skip connection is only shape-compatible in this case.
+        self.use_res_connect = self.stride == 1 and inp == oup
+
+        layers = []
+        # With expand_ratio == 1 the expansion layer would be a no-op, so it is skipped.
+        if expand_ratio != 1:
+            # pw
+            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+        layers.extend([
+            # dw
+            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+            # pw-linear
+            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(oup),
+        ])
+        self.conv = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Apply the block, adding the input back when the residual path is enabled."""
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class MobileNetV2Custom(nn.Module):
+    """MobileNetV2 variant whose first convolution accepts a configurable channel count."""
+
+    def __init__(self, num_classes=2, in_channel=1, width_mult=1.0, inverted_residual_setting=None, round_nearest=8):
+        """
+        MobileNet V2 main class
+
+        Args:
+            num_classes (int): Number of classes
+            in_channel (int): Number of channels of the input to the first convolution
+            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
+            inverted_residual_setting: Network structure
+            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
+            Set to 1 to turn off rounding
+
+        Raises:
+            ValueError: if ``inverted_residual_setting`` is empty or its first
+                entry does not have exactly four elements
+        """
+        super(MobileNetV2Custom, self).__init__()
+        block = InvertedResidual
+        input_channel = 32
+        last_channel = 1280
+
+        if inverted_residual_setting is None:
+            inverted_residual_setting = [
+                # t, c, n, s
+                # (expand ratio, output channels, number of repeats, stride of the first repeat)
+                [1, 16, 1, 1],
+                [6, 24, 2, 2],
+                [6, 32, 3, 2],
+                [6, 64, 4, 2],
+                [6, 96, 3, 1],
+                [6, 160, 3, 2],
+                [6, 320, 1, 1],
+            ]
+
+        # only check the first element, assuming user knows t,c,n,s are required
+        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+            raise ValueError("inverted_residual_setting should be non-empty "
+                             "or a 4-element list, got {}".format(inverted_residual_setting))
+
+        # building first layer
+        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+        # The last channel count is never scaled below its default (max(1.0, width_mult)).
+        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+        # Changed from the copied original, which hard-coded in_channel=3.
+        features = [ConvBNReLU(in_channel, input_channel, stride=2)]
+        # building inverted residual blocks
+        for t, c, n, s in inverted_residual_setting:
+            output_channel = _make_divisible(c * width_mult, round_nearest)
+            for i in range(n):
+                # Only the first repeat of each stage uses the stage stride.
+                stride = s if i == 0 else 1
+                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+                input_channel = output_channel
+        # building last several layers
+        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
+        # make it nn.Sequential
+        self.features = nn.Sequential(*features)
+
+        # building classifier
+        self.classifier = nn.Sequential(
+            nn.Dropout(0.2),
+            nn.Linear(self.last_channel, num_classes),
+        )
+
+        # weight initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.zeros_(m.bias)
+
+    def forward(self, x):
+        """Run the feature extractor, average over dims 2 and 3, then classify."""
+        x = self.features(x)
+        # Global average pooling over the two spatial dimensions.
+        x = x.mean([2, 3])
+        x = self.classifier(x)
+        return x
diff --git a/AutoCoverTool/ref/online/model.py b/AutoCoverTool/ref/online/model.py
new file mode 100644
index 0000000..c5e8adc
--- /dev/null
+++ b/AutoCoverTool/ref/online/model.py
@@ -0,0 +1,71 @@
+from mobilenet_v2_custom import MobileNetV2Custom
+import torch
+import torch.nn as nn
+
+MFCC_LEN = 80
+FRAME_LEN = 128
+
+
+class MobileNetV2Gender(MobileNetV2Custom):
+    """MobileNetV2Custom that first reshapes its input to single-channel FRAME_LEN x MFCC_LEN maps."""
+
+    def forward(self, x):
+        """Reshape ``x`` to (-1, 1, FRAME_LEN, MFCC_LEN) and run the parent network."""
+        x = x.view([-1, 1, FRAME_LEN, MFCC_LEN])
+        return super(MobileNetV2Gender, self).forward(x)
+
+
+class MusicVoiceV5Model(nn.Module):
+    """
+    Two-class network built from depthwise-separable convolution blocks.
+
+    One standard 3x3 convolution is followed by thirteen depthwise + pointwise
+    blocks, an average pool and a linear layer with two outputs.
+    """
+
+    def __init__(self):
+        super(MusicVoiceV5Model, self).__init__()
+
+        def conv_bn(inp, oup, stride):
+            # Standard 3x3 convolution + batch norm + ReLU.
+            return nn.Sequential(
+                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
+                nn.BatchNorm2d(oup),
+                nn.ReLU(inplace=True)
+            )
+
+        def conv_dw(inp, oup, stride):
+            # Depthwise 3x3 convolution (groups=inp) followed by a pointwise 1x1
+            # convolution, each with batch norm + ReLU.
+            return nn.Sequential(
+                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
+                nn.BatchNorm2d(inp),
+                nn.ReLU(inplace=True),
+
+                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(oup),
+                nn.ReLU(inplace=True),
+            )
+
+        # Five stride-2 stages reduce a 128 x 80 input to 4 x 3, which the
+        # final AvgPool2d((4, 3)) collapses to 1 x 1.
+        self.model = nn.Sequential(
+            conv_bn(1, 32, 2),
+            conv_dw(32, 64, 1),
+            conv_dw(64, 128, 2),
+            conv_dw(128, 128, 1),
+            conv_dw(128, 256, 2),
+            conv_dw(256, 256, 1),
+            conv_dw(256, 512, 2),
+            conv_dw(512, 512, 1),
+            conv_dw(512, 512, 1),
+            conv_dw(512, 512, 1),
+            conv_dw(512, 512, 1),
+            conv_dw(512, 512, 1),
+            conv_dw(512, 1024, 2),
+            conv_dw(1024, 1024, 1),
+            nn.AvgPool2d((4, 3)),
+        )
+        self.fc = nn.Linear(1024, 2)
+
+    def forward(self, x):
+        """Reshape ``x`` to (-1, 1, FRAME_LEN, MFCC_LEN) and return the two raw class scores per sample."""
+        x = x.view([-1, 1, FRAME_LEN, MFCC_LEN])
+        x = self.model(x)
+        # Flatten the pooled feature map to one 1024-vector per sample.
+        x = x.view(-1, 1024)
+        x = self.fc(x)
+        return x
+
+
+def load_model(model_type, model_path, device):
+    """
+    Instantiate a model class and load saved weights into it.
+
+    :param model_type: model class, called with no arguments
+    :param model_path: path of the file read with ``torch.load``; its content
+        is passed to ``load_state_dict``
+    :param device: device name used for ``map_location`` and ``model.to``
+    :return: the model in eval mode on ``device``
+    """
+    model = model_type()
+    params = torch.load(model_path, map_location=torch.device(device))
+    model.load_state_dict(state_dict=params)
+    # Inference only: switch dropout / batch-norm layers to evaluation mode.
+    model.eval()
+    model.to(device)
+    return model
diff --git a/AutoCoverTool/ref/online/readme.md b/AutoCoverTool/ref/online/readme.md
new file mode 100644
index 0000000..10a1f09
--- /dev/null
+++ b/AutoCoverTool/ref/online/readme.md
@@ -0,0 +1,50 @@
+# 男女声识别
+
+```
+模型名称以及对应作用:
+---gender_8k_ratev5_v6_adam.pth // 男女声(纯人声)分类模型(使用8k纯人声数据集进行训练,mobilenet_v2,adam优化器)
+---gender_8k_v6_adam.pth // 男女声(带人声)分类模型(使用8k带人声数据集进行训练,mobilenet_v2,adam优化器)
+---voice_005_rec_v5.pth // 纯人声分类模型(400首人工标注的歌曲,判定纯人声段(使用作品中带人声段当作负样本) mobilenet_v1, sgd优化器)
+---voice_10_v5.pth // 带人声分类模型(400首人工标注的歌曲,判定带人声段, mobilenet_v1, sgd优化器)
+模型地址:https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/hub/voice_classification/models.zip
+```
+
+# 文件说明
+```
+---common.py // 用于绑定核心的代码
+---mobilenet_v2_custom.py // 模型代码
+---model.py // 调用模型的封装层
+---readme.md // 说明文件
+---voice_class_online.py // 运行时使用的文件
+```
+
+# 环境安装
+```
+cd /home/worker
+wget "https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/hub/voice_classification/bin/bin.zip"
+unzip bin.zip
+rm -f bin.zip
+export PATH=$PATH:/home/worker/bin # 需要写入到.zshrc中
+sudo yum install libsndfile-devel
+
+# 以下使用手动安装即可
+conda create -n voice_class python=3.7 -y
+conda activate voice_class
+pip3 install librosa
+pip3 install psutil
+pip3 install torch==1.5 torchvision torchaudio
+```
+
+# 使用说明
+```
+下载模型并解压后,按照voice_class_online.py中的运行方式运行即可
+```
+
+# 注意:
+目前代码中限制了CPU的核心数量,只允许占用一个核,建议根据核心的情况多开几个进程做处理
+
+# 性能测试(不加性能限制的情况下在GPU-2机器上测试得到):
+20个线上样本(男10,女10)
+
+CPU情况:spend_time:tot=31.91|transcode=5.92|vb=3.12|gen_feature=3.5|predict=18.94
+GPU情况:spend_time:tot=15.64|transcode=6.34|vb=4.17|gen_feature=3.3|predict=1.443
diff --git a/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 b/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4
new file mode 100644
index 0000000..9b225ee
Binary files /dev/null and b/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 differ
diff --git a/AutoCoverTool/ref/online/voice_class_online.py b/AutoCoverTool/ref/online/voice_class_online.py
new file mode 100644
index 0000000..6041c94
--- /dev/null
+++ b/AutoCoverTool/ref/online/voice_class_online.py
@@ -0,0 +1,420 @@
+"""
+男女声分类在线工具
+1 转码为16bit单声道
+2 均衡化
+3 模型分类
+"""
+
+import os
+import sys
+import librosa
+import shutil
+import logging
+import time
+import torch.nn.functional as F
+import numpy as np
+from model import *
+# from common import bind_kernel
+
+logging.basicConfig(level=logging.INFO)
+
+os.environ["LRU_CACHE_CAPACITY"] = "1"
+
+# torch.set_num_threads(1)
+# bind_kernel(1)
+
+"""
+临时用一下,全局使用的变量
+"""
+
+transcode_time = 0
+vb_time = 0
+mfcc_time = 0
+predict_time = 0
+
+"""
+错误码
+"""
+ERR_CODE_SUCCESS = 0 # 处理成功
+ERR_CODE_NO_FILE = -1 # 文件不存在
+ERR_CODE_TRANSCODE = -2 # 转码失败
+ERR_CODE_VOLUME_BALANCED = -3 # 均衡化失败
+ERR_CODE_FEATURE_TOO_SHORT = -4 # 特征文件太短
+
+"""
+常量
+"""
+
+FRAME_LEN = 128
+MFCC_LEN = 80
+
+EBUR128_BIN = "/opt/soft/bin/standard_audio_no_cut"
+# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut"
+GENDER_FEMALE = 0
+GENDER_MALE = 1
+GENDER_OTHER = 2
+"""
+通用函数
+"""
+
+
+def exec_cmd(cmd):
+    """
+    Run a shell command with ``os.system``.
+
+    :param cmd: shell command line
+    :return: True when ``os.system`` returned 0, False otherwise
+    """
+    ret = os.system(cmd)
+    if ret != 0:
+        return False
+    return True
+
+
+"""
+业务需要的函数
+"""
+
+
+def get_one_mfcc(file_url):
+    """
+    Load an audio file at 16 kHz and compute its MFCC matrix.
+
+    :param file_url: path of the audio file handed to ``librosa.load``
+    :return: the ``librosa.feature.mfcc`` result (MFCC_LEN coefficients),
+        transposed; or an empty list when the audio has fewer than 512 samples
+    """
+    st = time.time()
+    data, sr = librosa.load(file_url, sr=16000)
+    # Fewer samples than one FFT window (n_fft=512): nothing to analyse.
+    if len(data) < 512:
+        return []
+    mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN)
+    mfcc = mfcc.transpose()
+    print("get_one_mfcc:spend_time={}".format(time.time() - st))
+    # Accumulate into the module-level timing counter.
+    global mfcc_time
+    mfcc_time += time.time() - st
+    return mfcc
+
+
+def volume_balanced(src, dst):
+    """
+    Run the external EBUR128_BIN tool on ``src`` to produce ``dst``.
+
+    Success is judged by whether ``dst`` exists afterwards; the tool's exit
+    status is not consulted.
+
+    :param src: input audio path
+    :param dst: output audio path
+    :return: True when ``dst`` exists after the command ran
+    """
+    st = time.time()
+    cmd = "{} {} {}".format(EBUR128_BIN, src, dst)
+    logging.info(cmd)
+    exec_cmd(cmd)
+    if not os.path.exists(dst):
+        logging.error("volume_balanced:cmd={}".format(cmd))
+    print("volume_balanced:spend_time={}".format(time.time() - st))
+
+    # Accumulate into the module-level timing counter.
+    global vb_time
+    vb_time += time.time() - st
+    return os.path.exists(dst)
+
+
+def transcode(src, dst):
+    """
+    Convert ``src`` to 16 kHz mono audio at ``dst`` with ffmpeg.
+
+    Success is judged by whether ``dst`` exists afterwards; ffmpeg's exit
+    status is not consulted.
+
+    :param src: input media path
+    :param dst: output path
+    :return: True when ``dst`` exists after the command ran
+    """
+    st = time.time()
+    # NOTE(review): no "-y" flag, so this appears to rely on dst not existing yet - confirm.
+    cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst)
+    logging.info(cmd)
+    exec_cmd(cmd)
+    if not os.path.exists(dst):
+        logging.error("transcode:cmd={}".format(cmd))
+    print("transcode:spend_time={}".format(time.time() - st))
+    # Accumulate into the module-level timing counter.
+    global transcode_time
+    transcode_time += time.time() - st
+    return os.path.exists(dst)
+
+
+class VoiceClass:
+
+ def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model):
+ """
+ 四个模型
+ :param music_voice_pure_model: 分辨纯净人声/其他
+ :param music_voice_no_pure_model: 分辨有人声/其他
+ :param gender_pure_model: 纯净人声分辨男女
+ :param gender_no_pure_model: 有人声分辨男女
+ """
+ st = time.time()
+ self.device = "cpu"
+ self.batch_size = 256
+ self.music_voice_pure_model = load_model(MusicVoiceV5Model, music_voice_pure_model, self.device)
+ self.music_voice_no_pure_model = load_model(MusicVoiceV5Model, music_voice_no_pure_model, self.device)
+ self.gender_pure_model = load_model(MobileNetV2Gender, gender_pure_model, self.device)
+ self.gender_no_pure_model = load_model(MobileNetV2Gender, gender_no_pure_model, self.device)
+ logging.info("load model ok ! spend_time={}".format(time.time() - st))
+
+ def batch_predict(self, model, features):
+ st = time.time()
+ scores = []
+ with torch.no_grad():
+ for i in range(0, len(features), self.batch_size):
+ cur_data = features[i:i + self.batch_size].to(self.device)
+ predicts = model(cur_data)
+ predicts_score = F.softmax(predicts, dim=1)
+ scores.extend(predicts_score.cpu().numpy())
+ ret = np.array(scores)
+ global predict_time
+ predict_time += time.time() - st
+ return ret
+
+ def predict_pure(self, filename, features):
+ scores = self.batch_predict(self.music_voice_pure_model, features)
+ new_features = []
+ for idx, score in enumerate(scores):
+ if score[0] > 0.5: # 非人声
+ continue
+ new_features.append(features[idx].numpy())
+
+ # 人声段太少,不能进行处理
+ # 参数可以改
+ new_feature_len = len(new_features)
+ new_feature_rate = len(new_features) / len(features)
+ if new_feature_len < 4 or new_feature_rate < 0.4:
+ logging.warning(
+ "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, -1
+ new_features = torch.from_numpy(np.array(new_features))
+ scores = self.batch_predict(self.gender_pure_model, new_features)
+ f_avg = sum(scores[:, 0]) / len(scores)
+ m_avg = sum(scores[:, 1]) / len(scores)
+ female_rate = f_avg / (f_avg + m_avg)
+ if female_rate > 0.65:
+ return GENDER_FEMALE, female_rate
+ if female_rate < 0.12:
+ return GENDER_MALE, female_rate
+ logging.warning(
+ "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, female_rate
+
+ def predict_no_pure(self, filename, features):
+ scores = self.batch_predict(self.music_voice_no_pure_model, features)
+ new_features = []
+ for idx, score in enumerate(scores):
+ if score[0] > 0.5: # 非人声
+ continue
+ new_features.append(features[idx].numpy())
+
+ # 人声段太少,不能进行处理
+ # 参数可以改
+ new_feature_len = len(new_features)
+ new_feature_rate = len(new_features) / len(features)
+ if new_feature_len < 4 or new_feature_rate < 0.4:
+ logging.warning(
+ "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, -1
+ new_features = torch.from_numpy(np.array(new_features))
+ scores = self.batch_predict(self.gender_no_pure_model, new_features)
+ f_avg = sum(scores[:, 0]) / len(scores)
+ m_avg = sum(scores[:, 1]) / len(scores)
+ female_rate = f_avg / (f_avg + m_avg)
+ if female_rate > 0.75:
+ return GENDER_FEMALE, female_rate
+ if female_rate < 0.1:
+ return GENDER_MALE, female_rate
+ logging.warning(
+ "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, female_rate
+
+ def predict(self, filename, features):
+ st = time.time()
+ new_features = []
+ for i in range(FRAME_LEN, len(features), FRAME_LEN):
+ new_features.append(features[i - FRAME_LEN: i])
+ new_features = torch.from_numpy(np.array(new_features))
+ gender, rate = self.predict_pure(filename, new_features)
+ if gender == GENDER_OTHER:
+ logging.info("start no pure process...")
+ return self.predict_no_pure(filename, new_features)
+ print("predict|spend_time={}".format(time.time() - st))
+ return gender, rate
+
+ def process_one_logic(self, filename, file_path, cache_dir):
+ tmp_wav = os.path.join(cache_dir, "tmp.wav")
+ tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav")
+ if not transcode(file_path, tmp_wav):
+ return ERR_CODE_TRANSCODE
+ if not volume_balanced(tmp_wav, tmp_vb_wav):
+ return ERR_CODE_VOLUME_BALANCED
+ features = get_one_mfcc(tmp_vb_wav)
+ if len(features) < FRAME_LEN:
+ logging.error("feature too short|file_path={}".format(file_path))
+ return ERR_CODE_FEATURE_TOO_SHORT
+ return self.predict(filename, features)
+
+ def process_one(self, file_path):
+ base_dir = os.path.dirname(file_path)
+ filename = os.path.splitext(file_path)[0]
+ cache_dir = os.path.join(base_dir, filename + "_cache")
+ if os.path.exists(cache_dir):
+ shutil.rmtree(cache_dir)
+ os.makedirs(cache_dir)
+ ret = self.process_one_logic(filename, file_path, cache_dir)
+ shutil.rmtree(cache_dir)
+ return ret
+
+ def process(self, file_path):
+ gender, female_rate = self.process_one(file_path)
+ logging.info("{}|gender={}|female_rate={}".format(file_path, gender, female_rate))
+ return gender, female_rate
+
+ def process_by_feature(self, feature_file):
+ """
+ 直接处理特征文件
+ :param feature_file:
+ :return:
+ """
+ filename = os.path.splitext(feature_file)[0]
+ features = np.load(feature_file)
+ gender, female_rate = self.predict(filename, features)
+ return gender, female_rate
+
+
+def test_all_feature():
+    """
+    Evaluate the classifier on pre-computed feature files.
+
+    Reads ``*feature.npy`` files from the ``female``, ``male`` and ``other``
+    sub-directories of a hard-coded dataset path, classifies each with
+    ``VoiceClass.process_by_feature`` and prints a confusion table plus
+    per-gender precision (printed as "acc") and recall.
+    """
+    import glob
+    base_dir = "/data/datasets/music_voice_dataset_full/feature_online_data_v3"
+    female = glob.glob(os.path.join(base_dir, "female/*feature.npy"))
+    male = glob.glob(os.path.join(base_dir, "male/*feature.npy"))
+    other = glob.glob(os.path.join(base_dir, "other/*feature.npy"))
+    model_path = "/data/jianli.yang/voice_classification/online/models"
+    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+
+    tot_st = time.time()
+    # Confusion table: ret_map[expected][predicted] with 0=female, 1=male, 2=other.
+    ret_map = {
+        0: {0: 0, 1: 0, 2: 0},
+        1: {0: 0, 1: 0, 2: 0},
+        2: {0: 0, 1: 0, 2: 0}
+    }
+    for file in female:
+        st = time.time()
+        print("------------------------------>>>>>")
+        gender, female_score = vc.process_by_feature(file)
+        ret_map[0][gender] += 1
+        if gender != 0:
+            print("err:female->{}|{}|{}".format(gender, file, female_score))
+        print("process|spend_tm=={}".format(time.time() - st))
+
+    for file in male:
+        st = time.time()
+        print("------------------------------>>>>>")
+        gender, female_score = vc.process_by_feature(file)
+        ret_map[1][gender] += 1
+        if gender != 1:
+            print("err:male->{}|{}|{}".format(gender, file, female_score))
+        print("process|spend_tm=={}".format(time.time() - st))
+
+    for file in other:
+        st = time.time()
+        print("------------------------------>>>>>")
+        gender, female_score = vc.process_by_feature(file)
+        ret_map[2][gender] += 1
+        if gender != 2:
+            print("err:other->{}|{}|{}".format(gender, file, female_score))
+        print("process|spend_tm=={}".format(time.time() - st))
+
+    global transcode_time, vb_time, mfcc_time, predict_time
+    print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time,
+                                                                                 vb_time, mfcc_time, predict_time))
+    # Naming: <expected>_<predicted>, e.g. f_m = expected female, predicted male.
+    f_f = ret_map[0][0]
+    f_m = ret_map[0][1]
+    f_o = ret_map[0][2]
+    m_f = ret_map[1][0]
+    m_m = ret_map[1][1]
+    m_o = ret_map[1][2]
+    o_f = ret_map[2][0]
+    o_m = ret_map[2][1]
+    o_o = ret_map[2][2]
+
+    print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
+    print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
+    print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
+    # Female precision (correct / all predicted female) and recall.
+    # NOTE(review): these divisions raise ZeroDivisionError when a denominator is 0.
+    f_acc = f_f / (f_f + m_f + o_f)
+    f_recall = f_f / (f_f + f_m + f_o)
+    # Male precision (correct / all predicted male) and recall.
+    m_acc = m_m / (m_m + f_m + o_m)
+    m_recall = m_m / (m_m + m_f + m_o)
+    print("female: acc={}|recall={}".format(f_acc, f_recall))
+    print("male: acc={}|recall={}".format(m_acc, m_recall))
+
+
+def test_all():
+    """
+    Evaluate the classifier end to end on audio files.
+
+    Reads ``*mp4`` files from the ``female``, ``male`` and ``other``
+    sub-directories of a hard-coded dataset path, classifies each with
+    ``VoiceClass.process`` and prints a confusion table plus per-gender
+    precision (printed as "acc") and recall.
+    """
+    import glob
+    base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200"
+    female = glob.glob(os.path.join(base_dir, "female/*mp4"))
+    male = glob.glob(os.path.join(base_dir, "male/*mp4"))
+    other = glob.glob(os.path.join(base_dir, "other/*mp4"))
+    model_path = "/data/jianli.yang/voice_classification/online/models"
+    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+
+    tot_st = time.time()
+    # Confusion table: ret_map[expected][predicted] with 0=female, 1=male, 2=other.
+    ret_map = {
+        0: {0: 0, 1: 0, 2: 0},
+        1: {0: 0, 1: 0, 2: 0},
+        2: {0: 0, 1: 0, 2: 0}
+    }
+    for file in female:
+        st = time.time()
+        print("------------------------------>>>>>")
+        gender, female_score = vc.process(file)
+        ret_map[0][gender] += 1
+        if gender != 0:
+            print("err:female->{}|{}|{}".format(gender, file, female_score))
+        print("process|spend_tm=={}".format(time.time() - st))
+
+    for file in male:
+        st = time.time()
+        print("------------------------------>>>>>")
+        gender, female_score = vc.process(file)
+        ret_map[1][gender] += 1
+        if gender != 1:
+            print("err:male->{}|{}|{}".format(gender, file, female_score))
+        print("process|spend_tm=={}".format(time.time() - st))
+
+    for file in other:
+        st = time.time()
+        print("------------------------------>>>>>")
+        gender, female_score = vc.process(file)
+        ret_map[2][gender] += 1
+        if gender != 2:
+            print("err:other->{}|{}|{}".format(gender, file, female_score))
+        print("process|spend_tm=={}".format(time.time() - st))
+
+    global transcode_time, vb_time, mfcc_time, predict_time
+    print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time,
+                                                                                 vb_time, mfcc_time, predict_time))
+    # Naming: <expected>_<predicted>, e.g. f_m = expected female, predicted male.
+    f_f = ret_map[0][0]
+    f_m = ret_map[0][1]
+    f_o = ret_map[0][2]
+    m_f = ret_map[1][0]
+    m_m = ret_map[1][1]
+    m_o = ret_map[1][2]
+    o_f = ret_map[2][0]
+    o_m = ret_map[2][1]
+    o_o = ret_map[2][2]
+
+    print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
+    print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
+    print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
+    # Female precision (correct / all predicted female) and recall.
+    # NOTE(review): these divisions raise ZeroDivisionError when a denominator is 0.
+    f_acc = f_f / (f_f + m_f + o_f)
+    f_recall = f_f / (f_f + f_m + f_o)
+    # Male precision (correct / all predicted male) and recall.
+    m_acc = m_m / (m_m + f_m + o_m)
+    m_recall = m_m / (m_m + m_f + m_o)
+    print("female: acc={}|recall={}".format(f_acc, f_recall))
+    print("male: acc={}|recall={}".format(m_acc, m_recall))
+
+
+# Command-line entry point: <model directory> <audio file>.
+if __name__ == "__main__":
+    # test_all()
+    # test_all_feature()
+    model_path = sys.argv[1]
+    voice_path = sys.argv[2]
+    # The four model files are looked up by fixed names inside the model directory.
+    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+    # Single pass; raise the upper bound to repeat the run for timing.
+    for i in range(0, 1):
+        st = time.time()
+        print("------------------------------>>>>>")
+        vc.process(voice_path)
+        print("process|spend_tm=={}".format(time.time() - st))

File Metadata

Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:30 (1 d, 10 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347164
Default Alt Text
(42 KB)

Event Timeline