diff --git a/AutoCoverTool/online/tone_shift_one.py b/AutoCoverTool/online/tone_shift_one.py
index e395c8d..9a422f8 100644
--- a/AutoCoverTool/online/tone_shift_one.py
+++ b/AutoCoverTool/online/tone_shift_one.py
@@ -1,328 +1,338 @@
"""
Pitch-shift based processing
1. Download
2. Separate
3. Shift the vocals by +2, the accompaniment by +1
4. Mix
"""
import os
import json
import shutil
import librosa
import logging
import numpy as np
from ref.music_remover.separate_interface import SeparateInterface
from online.inference_worker import upload_file2cos, gs_state_use, gs_state_finish, gs_state_default
from online.common import *
+from ref.online.voice_class_online import VoiceClass
logging.basicConfig(filename='/tmp/tone_shift_one.log', level=logging.INFO)
gs_tone_shift_exe = "/opt/soft/bin/tone_shift_exe"
gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"
gs_err_code_success = 0
gs_err_code_tone_shift = 1
gs_err_code_mix = 2
gs_err_code_transcode = 3
gs_err_code_upload = 4
gs_err_code_download = 5
gs_err_code_trans_to_mp3 = 6
gs_err_code_separate = 7
gs_err_code_duration_too_long = 8
gs_err_code_duration_no_vocal = 9
gs_err_code_duration_err = 10
gs_err_code_transcode_acc = 11
gs_err_code_upload_acc = 12
gs_err_code_download_acc = 13
gs_err_code_download_vocal = 14
gs_err_code_transcode_acc_v1 = 15
gs_err_code_transcode_vocal_v1 = 16
gs_err_code_silence_no_data = 17
gs_err_code_silence_no_process = 18
def exec_cmd(cmd):
r = os.popen(cmd)
text = r.read()
r.close()
return text
def get_d(audio_path):
cmd = "ffprobe -v quiet -print_format json -show_format -show_streams {}".format(audio_path)
data = exec_cmd(cmd)
data = json.loads(data)
    # duration in seconds
if 'format' in data.keys() and 'duration' in data['format']:
return float(data["format"]["duration"])
return -1
def get_mean_power(audio_path):
sr = 44100
audio, sr = librosa.load(audio_path, sr=sr, mono=True)
mm = np.mean(np.abs(audio))
return mm
class ToneShift:
def __init__(self):
self.separate_inst = SeparateInterface()
+ model_path = "./models"
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+
+ self.voice_class = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model,
+ gender_no_pure_model)
def update_state(self, song_id, state):
sql = "update svc_queue_table set state={},update_time={} where song_id = {}". \
format(state, int(time.time()), song_id)
banned_user_map['db'] = "av_db"
update_db(sql, banned_user_map)
def get_url_by_id(self, song_id):
sql = "select song_id, url from svc_queue_table where song_id={}".format(song_id)
banned_user_map["db"] = "av_db"
data = get_data_by_mysql(sql)
if len(data) == 0:
return None, None
return str(data[0][0]), data[0][1]
def get_one_data_logic(self):
"""
        Fetch one pending item, trying song_src priorities 5, 4, 3 in order
:return:
"""
song_src_arr = [5, 4, 3]
for song_src in song_src_arr:
song_id, song_url = self.get_one_data(song_src=song_src)
if song_id is not None:
return song_id, song_url
return None, None
def get_one_data(self, song_src=3):
sql = "select song_id, url from svc_queue_table where state = 0 and song_src={} order by create_time asc limit 1".format(
song_src)
banned_user_map["db"] = "av_db"
data = get_data_by_mysql(sql, banned_user_map)
if len(data) == 0:
return None, None
song_id, song_url = data[0]
if song_id != "":
self.update_state(song_id, gs_state_use)
return str(song_id), song_url
def pre_process(self, work_dir, song_url):
"""
        Create the working directory and download the data
:return:
"""
if "?sign=" in song_url:
return gs_err_code_download
ext = str(song_url).split(".")[-1]
dst_file = "{}/src_origin.{}".format(work_dir, ext)
cmd = "wget {} -O {}".format(song_url, dst_file)
os.system(cmd)
if not os.path.exists(dst_file):
return gs_err_code_download
duration = get_d(dst_file)
if duration < 0:
return gs_err_code_duration_err
print("Duration:", dst_file, duration)
if duration > 20 * 60:
return gs_err_code_duration_too_long
dst_mp3_file = "{}/src.wav".format(work_dir)
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} ".format(dst_file, dst_mp3_file)
os.system(cmd)
if not os.path.exists(dst_mp3_file):
return gs_err_code_trans_to_mp3
return gs_err_code_success
def tone_shift_one(self, in_file, dst_file, pitch):
cmd = "{} {} {} {}".format(gs_tone_shift_exe, in_file, dst_file, pitch)
os.system(cmd)
return os.path.exists(dst_file)
def mix(self, cid, vocal_path, acc_path, tp):
if tp == 1:
vocal_pitch = 2
acc_pitch = 0
else:
vocal_pitch = -2
acc_pitch = 0
vocal_path_2 = vocal_path.replace(".wav", "_{}.wav".format(vocal_pitch))
acc_path_2 = acc_path.replace(".wav", "_{}.wav".format(acc_pitch))
err = self.tone_shift_one(vocal_path, vocal_path_2, vocal_pitch)
if not err:
- return gs_err_code_tone_shift, None
+ return gs_err_code_tone_shift, None, None
+ gender, female_rate = self.voice_class.process_one(vocal_path_2)
err = self.tone_shift_one(acc_path, acc_path_2, acc_pitch)
if not err:
- return gs_err_code_tone_shift, None
+ return gs_err_code_tone_shift, None, None
base_dir = os.path.dirname(vocal_path)
mix_path = "{}/mix_{}_{}.wav".format(base_dir, vocal_pitch, acc_pitch)
cmd = "{} {} {} {}".format(gs_simple_mixer_path, vocal_path_2, acc_path_2, mix_path)
print("exec_cmd={}".format(cmd))
os.system(cmd)
if not os.path.exists(mix_path):
- return gs_err_code_mix, None
+ return gs_err_code_mix, None, None
        # Transcode
mix_path_mp3 = mix_path.replace(".wav", ".mp4")
cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(mix_path, mix_path_mp3)
os.system(cmd)
if not os.path.exists(mix_path_mp3):
- return gs_err_code_transcode, None
+ return gs_err_code_transcode, None, None
        # Upload to COS
mix_name = os.path.basename(mix_path_mp3)
key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name)
if not upload_file2cos(key, mix_path_mp3):
- return gs_err_code_upload, None
- return gs_err_code_success, key
+ return gs_err_code_upload, None, None
+ return gs_err_code_success, key, gender
def upload_acc(self, cid, acc_path):
        # Transcode
mix_path_aac = acc_path.replace(".wav", ".m4a")
cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(acc_path, mix_path_aac)
os.system(cmd)
if not os.path.exists(mix_path_aac):
return gs_err_code_transcode_acc, None
        # Upload
mix_name = os.path.basename(mix_path_aac)
key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name)
if not upload_file2cos(key, mix_path_aac):
return gs_err_code_upload_acc, None
return gs_err_code_success, key
def process_one(self, cid, work_dir):
"""
        :param cid: song id
        :param work_dir: working directory
:return:
"""
src_mp3 = os.path.join(work_dir, "src.wav")
vocal_path = os.path.join(work_dir, "vocal.wav")
acc_path = os.path.join(work_dir, "acc.wav")
if not (os.path.exists(vocal_path) and os.path.exists(acc_path)):
if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path):
return gs_err_code_separate, []
if not os.path.exists(vocal_path) or not os.path.exists(acc_path):
return gs_err_code_separate, []
        # If the mean energy of the vocal track is below a threshold, treat it as having no vocals
        # (0.01 is an empirical cutoff from sample analysis)
        # Samples without vocals: [0.0056, 0.0003]; samples with vocals (current minimum): [0.046, 0.049]
print("power:{},{}".format(cid, get_mean_power(vocal_path)))
if get_mean_power(vocal_path) < 0.02:
return gs_err_code_duration_no_vocal, []
- err, type1_mix_mp3 = self.mix(cid, vocal_path, acc_path, 1)
+ err, type1_mix_mp3, gender = self.mix(cid, vocal_path, acc_path, 1)
if err != gs_err_code_success:
return err, []
- err, type2_mix_mp3 = self.mix(cid, vocal_path, acc_path, 2)
+ err, type2_mix_mp3, gender2 = self.mix(cid, vocal_path, acc_path, 2)
if err != gs_err_code_success:
return err, []
        # Upload the accompaniment file
# err, acc_path_m4a = self.upload_acc(cid, acc_path)
# if err != gs_err_code_success:
# return err, []
- return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3]
+ return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3, str(gender), str(gender2)]
def download_and_transcode(self, url, local_path, local_path_wav):
cmd = "wget {} -O {}".format(url, local_path)
os.system(cmd)
if not os.path.exists(local_path):
return -1
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(local_path, local_path_wav)
os.system(cmd)
if not os.path.exists(local_path_wav):
return -2
return 0
def get_data_from_mysql(self, cid, work_dir):
sql = "select starmaker_songid,task_url,complete_url,voice_url from starmaker_musicbook.silence where starmaker_songid={} order by task_id desc limit 1".format(
cid)
data = get_data_by_mysql(sql, banned_user_map)
if len(data) == 0:
return gs_err_code_silence_no_data
song_id, task_url, complete_url, voice_url = data[0]
if complete_url != "" and voice_url != "":
"""
            Download the vocal and accompaniment tracks
"""
ext = str(complete_url).split(".")[-1]
acc_dst_file = os.path.join(work_dir, "acc.{}".format(ext))
acc_wav_dst_file = os.path.join(work_dir, "acc.wav")
err = self.download_and_transcode(complete_url, acc_dst_file, acc_wav_dst_file)
os.unlink(acc_dst_file)
if err == -1:
return gs_err_code_download_acc
if err == -2:
return gs_err_code_transcode_acc_v1
ext = str(voice_url).split(".")[-1]
vocal_dst_file = os.path.join(work_dir, "vocal.{}".format(ext))
vocal_wav_dst_file = os.path.join(work_dir, "vocal.wav")
err = self.download_and_transcode(voice_url, vocal_dst_file, vocal_wav_dst_file)
os.unlink(vocal_dst_file)
if err == -1:
return gs_err_code_download_vocal
if err == -2:
return gs_err_code_transcode_vocal_v1
return gs_err_code_success
return gs_err_code_silence_no_process
def process_worker(self):
logging.info("start process_worker .....")
base_dir = "/tmp/tone_shift_one"
if not os.path.exists(base_dir):
os.makedirs(base_dir)
while True:
worker_st = time.time()
cid, song_url = self.get_one_data_logic()
- # cid, song_url = self.get_url_by_id('175210503076374799')
+ # cid, song_url = self.get_url_by_id('611752105030548048')
if cid is None:
time.sleep(5)
logging.info("get one data is None ...")
continue
work_dir = os.path.join(base_dir, str(cid))
if os.path.exists(work_dir):
shutil.rmtree(work_dir)
os.makedirs(work_dir)
            # First check whether the silence database already has a finished result for this item; if so, just download it
err = self.get_data_from_mysql(cid, work_dir)
if err != gs_err_code_success:
                # Clean up the disk
shutil.rmtree(work_dir)
os.makedirs(work_dir)
err = self.pre_process(work_dir, song_url)
if err != gs_err_code_success:
self.update_state(str(cid), -err)
continue
st = time.time()
err, data = self.process_one(str(cid), work_dir)
logging.info("process_finish,{},{}".format(cid, time.time() - st))
if err == gs_err_code_success and len(data) != 0:
sql = "update svc_queue_table set state={},update_time={},svc_url=\"{}\" where song_id = {}". \
format(gs_state_finish, int(time.time()), ",".join(data), str(cid))
banned_user_map['db'] = "av_db"
update_db(sql, banned_user_map)
else:
self.update_state(str(cid), -err)
shutil.rmtree(work_dir)
logging.info("process_finish,{},{}".format(cid, time.time() - worker_st))
if __name__ == '__main__':
ts = ToneShift()
ts.process_worker()
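Note on the shift-and-mix step above: tone_shift_exe and simple_mixer are invoked through os.system, so failures are only detected by checking for the output file. Below is a minimal sketch of the same flow (a hedged example, not part of the patch) using subprocess.run so a non-zero exit status is caught directly; the binary paths are the ones defined in tone_shift_one.py, and the input paths are hypothetical.

```python
import os
import subprocess

GS_TONE_SHIFT_EXE = "/opt/soft/bin/tone_shift_exe"  # same binary as gs_tone_shift_exe above
GS_SIMPLE_MIXER = "/opt/soft/bin/simple_mixer"      # same binary as gs_simple_mixer_path above


def run_tool(args):
    """Run an external tool; True only on a zero exit status."""
    proc = subprocess.run(args, capture_output=True, text=True)
    if proc.returncode != 0:
        print("tool failed: {} -> {}".format(" ".join(args), proc.stderr.strip()))
        return False
    return True


def shift_and_mix(vocal_path, acc_path, vocal_pitch, acc_pitch, mix_path):
    """Pitch-shift vocal and accompaniment, then mix them, mirroring ToneShift.mix()."""
    vocal_out = vocal_path.replace(".wav", "_{}.wav".format(vocal_pitch))
    acc_out = acc_path.replace(".wav", "_{}.wav".format(acc_pitch))
    if not run_tool([GS_TONE_SHIFT_EXE, vocal_path, vocal_out, str(vocal_pitch)]):
        return None
    if not run_tool([GS_TONE_SHIFT_EXE, acc_path, acc_out, str(acc_pitch)]):
        return None
    if not run_tool([GS_SIMPLE_MIXER, vocal_out, acc_out, mix_path]):
        return None
    return mix_path if os.path.exists(mix_path) else None


if __name__ == "__main__":
    # hypothetical work-dir layout matching process_one() above
    print(shift_and_mix("/tmp/demo/vocal.wav", "/tmp/demo/acc.wav", 2, 0, "/tmp/demo/mix_2_0.wav"))
```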
diff --git a/AutoCoverTool/ref/online/common.py b/AutoCoverTool/ref/online/common.py
new file mode 100644
index 0000000..af3487a
--- /dev/null
+++ b/AutoCoverTool/ref/online/common.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+"""
+Bind the process to CPU cores
+A script may be launched several times; each instance binds one core, and no two instances bind the same core
+Each process binds n cores, or you can pass in the specific core indices to bind
+"""
+
+import time
+import psutil
+import os
+import sys
+import hashlib
+import fcntl
+
+"""
+Automatically pick idle cores
+"""
+
+
+def exec_cmd_ints(cmd):
+ """
+    Run cmd and collect the integer lines from its output
+ :param cmd:
+ :return:
+ """
+ r = os.popen(cmd)
+ lines = r.readlines()
+ ids = []
+ for line in lines:
+ line = line.strip()
+ if line.isdigit():
+ id = int(float(line))
+ ids.append(id)
+ return ids
+
+
+def get_idle_kernel(n=1):
+ cur_id = os.getpid()
+ name = os.path.basename(sys.argv[0])
+ command = "ps -ef | grep {} |grep python | awk \'{{print $2}}\'".format(name)
+ print(command)
+ ids = exec_cmd_ints(command)
+
+ print(ids, cur_id)
+    # Collect all cores already bound by other instances of this script
+ count = psutil.cpu_count()
+ used = [False] * (count // n)
+ command = "pidstat | grep {} | awk \'{{print $(NF-1)}}\'"
+ for i in range(0, len(ids)):
+ if cur_id != ids[i]:
+ cmd = command.format(ids[i])
+ kers = exec_cmd_ints(cmd)
+ for ker in kers:
+ ker = ker // n
+ used[ker] = True
+ print(used)
+    # Pick n free cores
+ for i in range(0, len(used)):
+ if not used[i]:
+ res = []
+ cur_i = i * n
+ for idx in range(cur_i, cur_i+n):
+ if idx < count:
+ res.append(idx)
+ return res
+ return 0
+
+
+def bind_kernel(n=1, kernel=[]):
+ p = psutil.Process()
+
+    # Acquire the lock
+ name = hashlib.md5(os.path.basename(sys.argv[0]).encode('utf-8')).hexdigest()
+ name = os.path.join("/tmp", name + ".lock")
+ if not os.path.exists(name):
+ with open(name, "w") as f:
+ f.write("0")
+ file = open(name)
+    fcntl.flock(file.fileno(), fcntl.LOCK_EX)  # exclusive lock
+ print("lock file --- {}".format(name))
+ if len(kernel) > 0:
+ kernels = kernel
+ else:
+ kernels = get_idle_kernel(n)
+    p.cpu_affinity(kernels)  # bind to the selected cores
+ print("bind_kernel", kernels)
+    file.close()  # release the lock
+ print("unlock file --- {}".format(name))
+
+
+def calc_forever():
+ for i in range(0, 10000):
+ time.sleep(1000)
\ No newline at end of file
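For reference, a minimal usage sketch for the core-binding helper above (a hedged example, not part of the patch): each worker process calls bind_kernel once before entering its loop; the worker call itself is hypothetical.

```python
from common import bind_kernel  # assumes ref/online is on PYTHONPATH


def main():
    # Under the file lock, find one idle core and pin this process to it;
    # pass kernel=[2] instead to bind an explicit core index.
    bind_kernel(n=1)
    # ... start the per-core work loop here, e.g. ToneShift().process_worker()


if __name__ == "__main__":
    main()
```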
diff --git a/AutoCoverTool/ref/online/mobilenet_v2_custom.py b/AutoCoverTool/ref/online/mobilenet_v2_custom.py
new file mode 100644
index 0000000..57b1227
--- /dev/null
+++ b/AutoCoverTool/ref/online/mobilenet_v2_custom.py
@@ -0,0 +1,142 @@
+"""
+Copied directly from the torchvision code base
+Purpose: the stock mobilenet_v2 only accepts 3-channel image input, which does not meet our needs, so it is copied here and modified
+"""
+
+from torch import nn
+
+
+def _make_divisible(v, divisor, min_value=None):
+ """
+ This function is taken from the original tf repo.
+ It ensures that all layers have a channel number that is divisible by 8
+ It can be seen here:
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+ :param v:
+ :param divisor:
+ :param min_value:
+ :return:
+ """
+ if min_value is None:
+ min_value = divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ # Make sure that round down does not go down by more than 10%.
+ if new_v < 0.9 * v:
+ new_v += divisor
+ return new_v
+
+
+class ConvBNReLU(nn.Sequential):
+ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+ padding = (kernel_size - 1) // 2
+ super(ConvBNReLU, self).__init__(
+ nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+ nn.BatchNorm2d(out_planes),
+ nn.ReLU6(inplace=True)
+ )
+
+
+class InvertedResidual(nn.Module):
+ def __init__(self, inp, oup, stride, expand_ratio):
+ super(InvertedResidual, self).__init__()
+ self.stride = stride
+ assert stride in [1, 2]
+
+ hidden_dim = int(round(inp * expand_ratio))
+ self.use_res_connect = self.stride == 1 and inp == oup
+
+ layers = []
+ if expand_ratio != 1:
+ # pw
+ layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+ layers.extend([
+ # dw
+ ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+ # pw-linear
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup),
+ ])
+ self.conv = nn.Sequential(*layers)
+
+ def forward(self, x):
+ if self.use_res_connect:
+ return x + self.conv(x)
+ else:
+ return self.conv(x)
+
+
+class MobileNetV2Custom(nn.Module):
+ def __init__(self, num_classes=2, in_channel=1, width_mult=1.0, inverted_residual_setting=None, round_nearest=8):
+ """
+ MobileNet V2 main class
+
+ Args:
+ num_classes (int): Number of classes
+ width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
+ inverted_residual_setting: Network structure
+ round_nearest (int): Round the number of channels in each layer to be a multiple of this number
+ Set to 1 to turn off rounding
+ """
+ super(MobileNetV2Custom, self).__init__()
+ block = InvertedResidual
+ input_channel = 32
+ last_channel = 1280
+
+ if inverted_residual_setting is None:
+ inverted_residual_setting = [
+ # t, c, n, s
+ [1, 16, 1, 1],
+ [6, 24, 2, 2],
+ [6, 32, 3, 2],
+ [6, 64, 4, 2],
+ [6, 96, 3, 1],
+ [6, 160, 3, 2],
+ [6, 320, 1, 1],
+ ]
+
+ # only check the first element, assuming user knows t,c,n,s are required
+ if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+ raise ValueError("inverted_residual_setting should be non-empty "
+ "or a 4-element list, got {}".format(inverted_residual_setting))
+
+ # building first layer
+ input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+ self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+        # Modified here: the original hard-codes in_channel=3
+ features = [ConvBNReLU(in_channel, input_channel, stride=2)]
+ # building inverted residual blocks
+ for t, c, n, s in inverted_residual_setting:
+ output_channel = _make_divisible(c * width_mult, round_nearest)
+ for i in range(n):
+ stride = s if i == 0 else 1
+ features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+ input_channel = output_channel
+ # building last several layers
+ features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
+ # make it nn.Sequential
+ self.features = nn.Sequential(*features)
+
+ # building classifier
+ self.classifier = nn.Sequential(
+ nn.Dropout(0.2),
+ nn.Linear(self.last_channel, num_classes),
+ )
+
+ # weight initialization
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out')
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.ones_(m.weight)
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.Linear):
+ nn.init.normal_(m.weight, 0, 0.01)
+ nn.init.zeros_(m.bias)
+
+ def forward(self, x):
+ x = self.features(x)
+ x = x.mean([2, 3])
+ x = self.classifier(x)
+ return x
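A quick shape check for the single-channel variant above (a sketch assuming torch is installed; 128x80 matches the FRAME_LEN x MFCC_LEN windows that model.py feeds into it):

```python
import torch
from mobilenet_v2_custom import MobileNetV2Custom

model = MobileNetV2Custom(num_classes=2, in_channel=1)
model.eval()
with torch.no_grad():
    x = torch.randn(4, 1, 128, 80)  # a batch of 4 single-channel MFCC windows
    logits = model(x)
print(logits.shape)  # torch.Size([4, 2]) -- one score pair per window
```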
diff --git a/AutoCoverTool/ref/online/model.py b/AutoCoverTool/ref/online/model.py
new file mode 100644
index 0000000..c5e8adc
--- /dev/null
+++ b/AutoCoverTool/ref/online/model.py
@@ -0,0 +1,71 @@
+from mobilenet_v2_custom import MobileNetV2Custom
+import torch
+import torch.nn as nn
+
+MFCC_LEN = 80
+FRAME_LEN = 128
+
+
+class MobileNetV2Gender(MobileNetV2Custom):
+
+ def forward(self, x):
+ x = x.view([-1, 1, FRAME_LEN, MFCC_LEN])
+ return super(MobileNetV2Gender, self).forward(x)
+
+
+class MusicVoiceV5Model(nn.Module):
+ def __init__(self):
+ super(MusicVoiceV5Model, self).__init__()
+
+ def conv_bn(inp, oup, stride):
+ return nn.Sequential(
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
+ nn.BatchNorm2d(oup),
+ nn.ReLU(inplace=True)
+ )
+
+ def conv_dw(inp, oup, stride):
+ return nn.Sequential(
+ nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
+ nn.BatchNorm2d(inp),
+ nn.ReLU(inplace=True),
+
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup),
+ nn.ReLU(inplace=True),
+ )
+
+ self.model = nn.Sequential(
+ conv_bn(1, 32, 2),
+ conv_dw(32, 64, 1),
+ conv_dw(64, 128, 2),
+ conv_dw(128, 128, 1),
+ conv_dw(128, 256, 2),
+ conv_dw(256, 256, 1),
+ conv_dw(256, 512, 2),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 1024, 2),
+ conv_dw(1024, 1024, 1),
+ nn.AvgPool2d((4, 3)),
+ )
+ self.fc = nn.Linear(1024, 2)
+
+ def forward(self, x):
+ x = x.view([-1, 1, FRAME_LEN, MFCC_LEN])
+ x = self.model(x)
+ x = x.view(-1, 1024)
+ x = self.fc(x)
+ return x
+
+
+def load_model(model_type, model_path, device):
+ model = model_type()
+ params = torch.load(model_path, map_location=torch.device(device))
+ model.load_state_dict(state_dict=params)
+ model.eval()
+ model.to(device)
+ return model
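A minimal sketch of loading a checkpoint through load_model() above; the ./models directory is a placeholder, and the class/checkpoint pairing follows VoiceClass.__init__ in voice_class_online.py:

```python
import torch
from model import MobileNetV2Gender, load_model

gender_model = load_model(MobileNetV2Gender, "./models/gender_8k_v6_adam.pth", "cpu")
with torch.no_grad():
    # forward() reshapes to [-1, 1, FRAME_LEN, MFCC_LEN], so a flat window is accepted
    logits = gender_model(torch.randn(1, 128, 80))
print(logits.shape)  # torch.Size([1, 2])
```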
diff --git a/AutoCoverTool/ref/online/readme.md b/AutoCoverTool/ref/online/readme.md
new file mode 100644
index 0000000..10a1f09
--- /dev/null
+++ b/AutoCoverTool/ref/online/readme.md
@@ -0,0 +1,50 @@
+# Male/female voice classification
+
+```
+Model names and their roles:
+---gender_8k_ratev5_v6_adam.pth // male/female classifier for pure vocals (trained on an 8k pure-vocal dataset; mobilenet_v2, Adam optimizer)
+---gender_8k_v6_adam.pth // male/female classifier for vocals over music (trained on an 8k mixed-vocal dataset; mobilenet_v2, Adam optimizer)
+---voice_005_rec_v5.pth // pure-vocal segment classifier (400 manually labeled songs; detects pure-vocal segments, using vocal-over-music segments as negatives; mobilenet_v1, SGD optimizer)
+---voice_10_v5.pth // vocal-presence segment classifier (400 manually labeled songs; detects segments containing vocals; mobilenet_v1, SGD optimizer)
+Model download: https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/hub/voice_classification/models.zip
+```
+
+# File overview
+```
+---common.py // core-binding helper
+---mobilenet_v2_custom.py // model definition
+---model.py // wrapper layer for invoking the models
+---readme.MD // this document
+---voice_class_online.py // runtime entry point
+```
+
+# Environment setup
+```
+cd /home/worker
+wget "https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/hub/voice_classification/bin/bin.zip"
+unzip bin.zip
+rm -f bin.zip
+export PATH=$PATH:/home/worker/bin # should be added to .zshrc
+sudo yum install libsndfile-devel
+
+# The following can be installed manually
+conda create -n voice_class python=3.7 -y
+conda activate voice_class
+pip3 install librosa
+pip3 install psutil
+pip3 install torch==1.5 torchvision torchaudio
+```
+
+# Usage
+```
+After downloading and unpacking the models, run as shown in voice_class_online.py
+```
+
+# Note:
+The code currently caps CPU usage at a single core; start several processes in parallel according to how many cores are available
+
+# Performance test (measured on the GPU-2 machine with no core limit):
+20 online samples (10 male, 10 female)
+
+CPU: spend_time:tot=31.91|transcode=5.92|vb=3.12|gen_feature=3.5|predict=18.94
+GPU: spend_time:tot=15.64|transcode=6.34|vb=4.17|gen_feature=3.3|predict=1.443
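To make the usage note above concrete, a hedged sketch of driving the classifier from Python rather than via sys.argv (the model directory and input path are placeholders):

```python
import os
from voice_class_online import VoiceClass

model_path = "./models"  # directory holding the four .pth files from models.zip
vc = VoiceClass(
    os.path.join(model_path, "voice_005_rec_v5.pth"),
    os.path.join(model_path, "voice_10_v5.pth"),
    os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth"),
    os.path.join(model_path, "gender_8k_v6_adam.pth"),
)
gender, female_rate = vc.process("./some_song.mp4")  # placeholder input file
print(gender, female_rate)  # gender: 0=female, 1=male, 2=other
```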
diff --git a/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 b/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4
new file mode 100644
index 0000000..9b225ee
Binary files /dev/null and b/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 differ
diff --git a/AutoCoverTool/ref/online/voice_class_online.py b/AutoCoverTool/ref/online/voice_class_online.py
new file mode 100644
index 0000000..6041c94
--- /dev/null
+++ b/AutoCoverTool/ref/online/voice_class_online.py
@@ -0,0 +1,420 @@
+"""
+Online male/female voice classification tool
+1 Transcode to 16-bit mono
+2 Loudness equalization
+3 Classify with the models
+"""
+
+import os
+import sys
+import librosa
+import shutil
+import logging
+import time
+import torch.nn.functional as F
+import numpy as np
+from model import *
+# from common import bind_kernel
+
+logging.basicConfig(level=logging.INFO)
+
+os.environ["LRU_CACHE_CAPACITY"] = "1"
+
+# torch.set_num_threads(1)
+# bind_kernel(1)
+
+"""
+Temporary global variables used for timing stats
+"""
+
+transcode_time = 0
+vb_time = 0
+mfcc_time = 0
+predict_time = 0
+
+"""
+Error codes
+"""
+ERR_CODE_SUCCESS = 0  # success
+ERR_CODE_NO_FILE = -1  # file does not exist
+ERR_CODE_TRANSCODE = -2  # transcode failed
+ERR_CODE_VOLUME_BALANCED = -3  # loudness equalization failed
+ERR_CODE_FEATURE_TOO_SHORT = -4  # feature sequence too short
+
+"""
+Constants
+"""
+
+FRAME_LEN = 128
+MFCC_LEN = 80
+
+EBUR128_BIN = "/opt/soft/bin/standard_audio_no_cut"
+# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut"
+GENDER_FEMALE = 0
+GENDER_MALE = 1
+GENDER_OTHER = 2
+"""
+Utility functions
+"""
+
+
+def exec_cmd(cmd):
+ ret = os.system(cmd)
+ if ret != 0:
+ return False
+ return True
+
+
+"""
+Business-logic functions
+"""
+
+
+def get_one_mfcc(file_url):
+ st = time.time()
+ data, sr = librosa.load(file_url, sr=16000)
+ if len(data) < 512:
+ return []
+ mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN)
+ mfcc = mfcc.transpose()
+ print("get_one_mfcc:spend_time={}".format(time.time() - st))
+ global mfcc_time
+ mfcc_time += time.time() - st
+ return mfcc
+
+
+def volume_balanced(src, dst):
+ st = time.time()
+ cmd = "{} {} {}".format(EBUR128_BIN, src, dst)
+ logging.info(cmd)
+ exec_cmd(cmd)
+ if not os.path.exists(dst):
+ logging.error("volume_balanced:cmd={}".format(cmd))
+ print("volume_balanced:spend_time={}".format(time.time() - st))
+
+ global vb_time
+ vb_time += time.time() - st
+ return os.path.exists(dst)
+
+
+def transcode(src, dst):
+ st = time.time()
+ cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst)
+ logging.info(cmd)
+ exec_cmd(cmd)
+ if not os.path.exists(dst):
+ logging.error("transcode:cmd={}".format(cmd))
+ print("transcode:spend_time={}".format(time.time() - st))
+ global transcode_time
+ transcode_time += time.time() - st
+ return os.path.exists(dst)
+
+
+class VoiceClass:
+
+ def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model):
+ """
+        Four models
+        :param music_voice_pure_model: pure vocals vs. other
+        :param music_voice_no_pure_model: vocals present vs. other
+        :param gender_pure_model: male vs. female, for pure vocals
+        :param gender_no_pure_model: male vs. female, for vocals over music
+ """
+ st = time.time()
+ self.device = "cpu"
+ self.batch_size = 256
+ self.music_voice_pure_model = load_model(MusicVoiceV5Model, music_voice_pure_model, self.device)
+ self.music_voice_no_pure_model = load_model(MusicVoiceV5Model, music_voice_no_pure_model, self.device)
+ self.gender_pure_model = load_model(MobileNetV2Gender, gender_pure_model, self.device)
+ self.gender_no_pure_model = load_model(MobileNetV2Gender, gender_no_pure_model, self.device)
+ logging.info("load model ok ! spend_time={}".format(time.time() - st))
+
+ def batch_predict(self, model, features):
+ st = time.time()
+ scores = []
+ with torch.no_grad():
+ for i in range(0, len(features), self.batch_size):
+ cur_data = features[i:i + self.batch_size].to(self.device)
+ predicts = model(cur_data)
+ predicts_score = F.softmax(predicts, dim=1)
+ scores.extend(predicts_score.cpu().numpy())
+ ret = np.array(scores)
+ global predict_time
+ predict_time += time.time() - st
+ return ret
+
+ def predict_pure(self, filename, features):
+ scores = self.batch_predict(self.music_voice_pure_model, features)
+ new_features = []
+ for idx, score in enumerate(scores):
+            if score[0] > 0.5:  # not vocals
+ continue
+ new_features.append(features[idx].numpy())
+
+        # Too few vocal segments to classify
+        # These thresholds are tunable
+ new_feature_len = len(new_features)
+ new_feature_rate = len(new_features) / len(features)
+ if new_feature_len < 4 or new_feature_rate < 0.4:
+ logging.warning(
+ "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, -1
+ new_features = torch.from_numpy(np.array(new_features))
+ scores = self.batch_predict(self.gender_pure_model, new_features)
+ f_avg = sum(scores[:, 0]) / len(scores)
+ m_avg = sum(scores[:, 1]) / len(scores)
+ female_rate = f_avg / (f_avg + m_avg)
+ if female_rate > 0.65:
+ return GENDER_FEMALE, female_rate
+ if female_rate < 0.12:
+ return GENDER_MALE, female_rate
+ logging.warning(
+ "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, female_rate
+
+ def predict_no_pure(self, filename, features):
+ scores = self.batch_predict(self.music_voice_no_pure_model, features)
+ new_features = []
+ for idx, score in enumerate(scores):
+            if score[0] > 0.5:  # not vocals
+ continue
+ new_features.append(features[idx].numpy())
+
+        # Too few vocal segments to classify
+        # These thresholds are tunable
+ new_feature_len = len(new_features)
+ new_feature_rate = len(new_features) / len(features)
+ if new_feature_len < 4 or new_feature_rate < 0.4:
+ logging.warning(
+ "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, -1
+ new_features = torch.from_numpy(np.array(new_features))
+ scores = self.batch_predict(self.gender_no_pure_model, new_features)
+ f_avg = sum(scores[:, 0]) / len(scores)
+ m_avg = sum(scores[:, 1]) / len(scores)
+ female_rate = f_avg / (f_avg + m_avg)
+ if female_rate > 0.75:
+ return GENDER_FEMALE, female_rate
+ if female_rate < 0.1:
+ return GENDER_MALE, female_rate
+ logging.warning(
+ "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, female_rate
+
+ def predict(self, filename, features):
+ st = time.time()
+ new_features = []
+ for i in range(FRAME_LEN, len(features), FRAME_LEN):
+ new_features.append(features[i - FRAME_LEN: i])
+ new_features = torch.from_numpy(np.array(new_features))
+ gender, rate = self.predict_pure(filename, new_features)
+ if gender == GENDER_OTHER:
+ logging.info("start no pure process...")
+ return self.predict_no_pure(filename, new_features)
+ print("predict|spend_time={}".format(time.time() - st))
+ return gender, rate
+
+ def process_one_logic(self, filename, file_path, cache_dir):
+ tmp_wav = os.path.join(cache_dir, "tmp.wav")
+ tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav")
+ if not transcode(file_path, tmp_wav):
+ return ERR_CODE_TRANSCODE
+ if not volume_balanced(tmp_wav, tmp_vb_wav):
+ return ERR_CODE_VOLUME_BALANCED
+ features = get_one_mfcc(tmp_vb_wav)
+ if len(features) < FRAME_LEN:
+ logging.error("feature too short|file_path={}".format(file_path))
+ return ERR_CODE_FEATURE_TOO_SHORT
+ return self.predict(filename, features)
+
+ def process_one(self, file_path):
+ base_dir = os.path.dirname(file_path)
+ filename = os.path.splitext(file_path)[0]
+ cache_dir = os.path.join(base_dir, filename + "_cache")
+ if os.path.exists(cache_dir):
+ shutil.rmtree(cache_dir)
+ os.makedirs(cache_dir)
+ ret = self.process_one_logic(filename, file_path, cache_dir)
+ shutil.rmtree(cache_dir)
+ return ret
+
+ def process(self, file_path):
+ gender, female_rate = self.process_one(file_path)
+ logging.info("{}|gender={}|female_rate={}".format(file_path, gender, female_rate))
+ return gender, female_rate
+
+ def process_by_feature(self, feature_file):
+ """
+        Classify a precomputed feature file directly
+ :param feature_file:
+ :return:
+ """
+ filename = os.path.splitext(feature_file)[0]
+ features = np.load(feature_file)
+ gender, female_rate = self.predict(filename, features)
+ return gender, female_rate
+
+
+def test_all_feature():
+ import glob
+ base_dir = "/data/datasets/music_voice_dataset_full/feature_online_data_v3"
+ female = glob.glob(os.path.join(base_dir, "female/*feature.npy"))
+ male = glob.glob(os.path.join(base_dir, "male/*feature.npy"))
+ other = glob.glob(os.path.join(base_dir, "other/*feature.npy"))
+ model_path = "/data/jianli.yang/voice_classification/online/models"
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+ vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+
+ tot_st = time.time()
+ ret_map = {
+ 0: {0: 0, 1: 0, 2: 0},
+ 1: {0: 0, 1: 0, 2: 0},
+ 2: {0: 0, 1: 0, 2: 0}
+ }
+ for file in female:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process_by_feature(file)
+ ret_map[0][gender] += 1
+ if gender != 0:
+ print("err:female->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ for file in male:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process_by_feature(file)
+ ret_map[1][gender] += 1
+ if gender != 1:
+ print("err:male->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ for file in other:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process_by_feature(file)
+ ret_map[2][gender] += 1
+ if gender != 2:
+ print("err:other->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ global transcode_time, vb_time, mfcc_time, predict_time
+ print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time,
+ vb_time, mfcc_time, predict_time))
+ f_f = ret_map[0][0]
+ f_m = ret_map[0][1]
+ f_o = ret_map[0][2]
+ m_f = ret_map[1][0]
+ m_m = ret_map[1][1]
+ m_o = ret_map[1][2]
+ o_f = ret_map[2][0]
+ o_m = ret_map[2][1]
+ o_o = ret_map[2][2]
+
+ print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
+ print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
+ print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
+    # Female precision and recall
+ f_acc = f_f / (f_f + m_f + o_f)
+ f_recall = f_f / (f_f + f_m + f_o)
+    # Male precision and recall
+ m_acc = m_m / (m_m + f_m + o_m)
+ m_recall = m_m / (m_m + m_f + m_o)
+ print("female: acc={}|recall={}".format(f_acc, f_recall))
+ print("male: acc={}|recall={}".format(m_acc, m_recall))
+
+
+def test_all():
+ import glob
+ base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200"
+ female = glob.glob(os.path.join(base_dir, "female/*mp4"))
+ male = glob.glob(os.path.join(base_dir, "male/*mp4"))
+ other = glob.glob(os.path.join(base_dir, "other/*mp4"))
+ model_path = "/data/jianli.yang/voice_classification/online/models"
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+ vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+
+ tot_st = time.time()
+ ret_map = {
+ 0: {0: 0, 1: 0, 2: 0},
+ 1: {0: 0, 1: 0, 2: 0},
+ 2: {0: 0, 1: 0, 2: 0}
+ }
+ for file in female:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process(file)
+ ret_map[0][gender] += 1
+ if gender != 0:
+ print("err:female->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ for file in male:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process(file)
+ ret_map[1][gender] += 1
+ if gender != 1:
+ print("err:male->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ for file in other:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process(file)
+ ret_map[2][gender] += 1
+ if gender != 2:
+ print("err:other->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ global transcode_time, vb_time, mfcc_time, predict_time
+ print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time,
+ vb_time, mfcc_time, predict_time))
+ f_f = ret_map[0][0]
+ f_m = ret_map[0][1]
+ f_o = ret_map[0][2]
+ m_f = ret_map[1][0]
+ m_m = ret_map[1][1]
+ m_o = ret_map[1][2]
+ o_f = ret_map[2][0]
+ o_m = ret_map[2][1]
+ o_o = ret_map[2][2]
+
+ print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
+ print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
+ print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
+    # Female precision and recall
+ f_acc = f_f / (f_f + m_f + o_f)
+ f_recall = f_f / (f_f + f_m + f_o)
+    # Male precision and recall
+ m_acc = m_m / (m_m + f_m + o_m)
+ m_recall = m_m / (m_m + m_f + m_o)
+ print("female: acc={}|recall={}".format(f_acc, f_recall))
+ print("male: acc={}|recall={}".format(m_acc, m_recall))
+
+
+if __name__ == "__main__":
+ # test_all()
+ # test_all_feature()
+ model_path = sys.argv[1]
+ voice_path = sys.argv[2]
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+ vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+ for i in range(0, 1):
+ st = time.time()
+ print("------------------------------>>>>>")
+ vc.process(voice_path)
+ print("process|spend_tm=={}".format(time.time() - st))
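For reference, a numpy-only sketch of the windowing scheme predict() uses above: the MFCC sequence is cut into consecutive, non-overlapping FRAME_LEN-frame windows, and any trailing remainder is dropped:

```python
import numpy as np

FRAME_LEN = 128
MFCC_LEN = 80

features = np.random.randn(1000, MFCC_LEN)  # stand-in for get_one_mfcc() output
windows = [features[i - FRAME_LEN:i] for i in range(FRAME_LEN, len(features), FRAME_LEN)]
print(len(windows), windows[0].shape)  # 7 (128, 80) -- the final 104 frames are discarded
```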