
diff --git a/AutoCoverTool/online/tone_shift_one.py b/AutoCoverTool/online/tone_shift_one.py
index e395c8d..9a422f8 100644
--- a/AutoCoverTool/online/tone_shift_one.py
+++ b/AutoCoverTool/online/tone_shift_one.py
@@ -1,328 +1,338 @@
"""
Pitch-shift processing pipeline:
1. Download
2. Separate vocals and accompaniment
3. Pitch-shift the vocals by +2 and the accompaniment by +1
4. Mix
"""
import os
import json
import shutil
import librosa
import logging
import numpy as np
from ref.music_remover.separate_interface import SeparateInterface
from online.inference_worker import upload_file2cos, gs_state_use, gs_state_finish, gs_state_default
from online.common import *
+from ref.online.voice_class_online import VoiceClass
logging.basicConfig(filename='/tmp/tone_shift_one.log', level=logging.INFO)
gs_tone_shift_exe = "/opt/soft/bin/tone_shift_exe"
gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"
gs_err_code_success = 0
gs_err_code_tone_shift = 1
gs_err_code_mix = 2
gs_err_code_transcode = 3
gs_err_code_upload = 4
gs_err_code_download = 5
gs_err_code_trans_to_mp3 = 6
gs_err_code_separate = 7
gs_err_code_duration_too_long = 8
gs_err_code_duration_no_vocal = 9
gs_err_code_duration_err = 10
gs_err_code_transcode_acc = 11
gs_err_code_upload_acc = 12
gs_err_code_download_acc = 13
gs_err_code_download_vocal = 14
gs_err_code_transcode_acc_v1 = 15
gs_err_code_transcode_vocal_v1 = 16
gs_err_code_silence_no_data = 17
gs_err_code_silence_no_process = 18
def exec_cmd(cmd):
r = os.popen(cmd)
text = r.read()
r.close()
return text
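# Example of the ffprobe JSON that get_d() below parses (shape assumed from
# typical ffprobe output, not taken from a real run):
# {"format": {"duration": "215.13", ...}, "streams": [...]}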
def get_d(audio_path):
cmd = "ffprobe -v quiet -print_format json -show_format -show_streams {}".format(audio_path)
data = exec_cmd(cmd)
data = json.loads(data)
# duration is returned in seconds
if 'format' in data.keys() and 'duration' in data['format']:
return float(data["format"]["duration"])
return -1
def get_mean_power(audio_path):
sr = 44100
audio, sr = librosa.load(audio_path, sr=sr, mono=True)
mm = np.mean(np.abs(audio))
return mm
class ToneShift:
def __init__(self):
self.separate_inst = SeparateInterface()
+ model_path = "./models"
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+
+ self.voice_class = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model,
+ gender_no_pure_model)
def update_state(self, song_id, state):
sql = "update svc_queue_table set state={},update_time={} where song_id = {}". \
format(state, int(time.time()), song_id)
banned_user_map['db'] = "av_db"
update_db(sql, banned_user_map)
def get_url_by_id(self, song_id):
sql = "select song_id, url from svc_queue_table where song_id={}".format(song_id)
banned_user_map["db"] = "av_db"
data = get_data_by_mysql(sql)
if len(data) == 0:
return None, None
return str(data[0][0]), data[0][1]
def get_one_data_logic(self):
"""
Fetch in priority order 5, 4, 3
:return:
"""
song_src_arr = [5, 4, 3]
for song_src in song_src_arr:
song_id, song_url = self.get_one_data(song_src=song_src)
if song_id is not None:
return song_id, song_url
return None, None
def get_one_data(self, song_src=3):
sql = "select song_id, url from svc_queue_table where state = 0 and song_src={} order by create_time asc limit 1".format(
song_src)
banned_user_map["db"] = "av_db"
data = get_data_by_mysql(sql, banned_user_map)
if len(data) == 0:
return None, None
song_id, song_url = data[0]
if song_id != "":
self.update_state(song_id, gs_state_use)
return str(song_id), song_url
def pre_process(self, work_dir, song_url):
"""
Create the working directory and download the data
:return:
"""
if "?sign=" in song_url:
return gs_err_code_download
ext = str(song_url).split(".")[-1]
dst_file = "{}/src_origin.{}".format(work_dir, ext)
cmd = "wget {} -O {}".format(song_url, dst_file)
os.system(cmd)
if not os.path.exists(dst_file):
return gs_err_code_download
duration = get_d(dst_file)
if duration < 0:
return gs_err_code_duration_err
print("Duration:", dst_file, duration)
if duration > 20 * 60:
return gs_err_code_duration_too_long
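# Note: despite its name, dst_mp3_file below is a 44.1 kHz stereo WAV
# (the gs_err_code_trans_to_mp3 name appears to be historical).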
dst_mp3_file = "{}/src.wav".format(work_dir)
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} ".format(dst_file, dst_mp3_file)
os.system(cmd)
if not os.path.exists(dst_mp3_file):
return gs_err_code_trans_to_mp3
return gs_err_code_success
def tone_shift_one(self, in_file, dst_file, pitch):
cmd = "{} {} {} {}".format(gs_tone_shift_exe, in_file, dst_file, pitch)
os.system(cmd)
return os.path.exists(dst_file)
def mix(self, cid, vocal_path, acc_path, tp):
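# Returns (err_code, cos_key, gender): gender is produced by VoiceClass on the
# pitch-shifted vocal, and is None on the error paths.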
if tp == 1:
vocal_pitch = 2
acc_pitch = 0
else:
vocal_pitch = -2
acc_pitch = 0
vocal_path_2 = vocal_path.replace(".wav", "_{}.wav".format(vocal_pitch))
acc_path_2 = acc_path.replace(".wav", "_{}.wav".format(acc_pitch))
err = self.tone_shift_one(vocal_path, vocal_path_2, vocal_pitch)
if not err:
- return gs_err_code_tone_shift, None
+ return gs_err_code_tone_shift, None, None
+ # VoiceClass.process_one returns an int error code on failure and a
+ # (gender, female_rate) tuple on success, so guard before unpacking
+ ret = self.voice_class.process_one(vocal_path_2)
+ gender, female_rate = ret if isinstance(ret, tuple) else (ret, -1)
err = self.tone_shift_one(acc_path, acc_path_2, acc_pitch)
if not err:
- return gs_err_code_tone_shift, None
+ return gs_err_code_tone_shift, None, None
base_dir = os.path.dirname(vocal_path)
mix_path = "{}/mix_{}_{}.wav".format(base_dir, vocal_pitch, acc_pitch)
cmd = "{} {} {} {}".format(gs_simple_mixer_path, vocal_path_2, acc_path_2, mix_path)
print("exec_cmd={}".format(cmd))
os.system(cmd)
if not os.path.exists(mix_path):
- return gs_err_code_mix, None
+ return gs_err_code_mix, None, None
# Transcode
mix_path_mp3 = mix_path.replace(".wav", ".mp4")
cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(mix_path, mix_path_mp3)
os.system(cmd)
if not os.path.exists(mix_path_mp3):
- return gs_err_code_transcode, None
+ return gs_err_code_transcode, None, None
# Upload to COS
mix_name = os.path.basename(mix_path_mp3)
key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name)
if not upload_file2cos(key, mix_path_mp3):
- return gs_err_code_upload, None
- return gs_err_code_success, key
+ return gs_err_code_upload, None, None
+ return gs_err_code_success, key, gender
def upload_acc(self, cid, acc_path):
# Transcode
mix_path_aac = acc_path.replace(".wav", ".m4a")
cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(acc_path, mix_path_aac)
os.system(cmd)
if not os.path.exists(mix_path_aac):
return gs_err_code_transcode_acc, None
# Upload
mix_name = os.path.basename(mix_path_aac)
key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name)
if not upload_file2cos(key, mix_path_aac):
return gs_err_code_upload_acc, None
return gs_err_code_success, key
def process_one(self, cid, work_dir):
"""
:param cid:
:param work_dir:
:return:
"""
src_mp3 = os.path.join(work_dir, "src.wav")
vocal_path = os.path.join(work_dir, "vocal.wav")
acc_path = os.path.join(work_dir, "acc.wav")
if not (os.path.exists(vocal_path) and os.path.exists(acc_path)):
if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path):
return gs_err_code_separate, []
if not os.path.exists(vocal_path) or not os.path.exists(acc_path):
return gs_err_code_separate, []
# When the mean energy of the vocal track is below a threshold, treat it as having no vocals
# (0.01 was the empirical cutoff, based on sample analysis)
# Samples without vocals: [0.0056, 0.0003]; samples with vocals (current minimum): [0.046, 0.049]
print("power:{},{}".format(cid, get_mean_power(vocal_path)))
if get_mean_power(vocal_path) < 0.02:
return gs_err_code_duration_no_vocal, []
- err, type1_mix_mp3 = self.mix(cid, vocal_path, acc_path, 1)
+ err, type1_mix_mp3, gender = self.mix(cid, vocal_path, acc_path, 1)
if err != gs_err_code_success:
return err, []
- err, type2_mix_mp3 = self.mix(cid, vocal_path, acc_path, 2)
+ err, type2_mix_mp3, gender2 = self.mix(cid, vocal_path, acc_path, 2)
if err != gs_err_code_success:
return err, []
# Upload the accompaniment file
# err, acc_path_m4a = self.upload_acc(cid, acc_path)
# if err != gs_err_code_success:
# return err, []
- return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3]
+ return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3, str(gender), str(gender2)]
def download_and_transcode(self, url, local_path, local_path_wav):
cmd = "wget {} -O {}".format(url, local_path)
os.system(cmd)
if not os.path.exists(local_path):
return -1
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(local_path, local_path_wav)
os.system(cmd)
if not os.path.exists(local_path_wav):
return -2
return 0
def get_data_from_mysql(self, cid, work_dir):
sql = "select starmaker_songid,task_url,complete_url,voice_url from starmaker_musicbook.silence where starmaker_songid={} order by task_id desc limit 1".format(
cid)
data = get_data_by_mysql(sql, banned_user_map)
if len(data) == 0:
return gs_err_code_silence_no_data
song_id, task_url, complete_url, voice_url = data[0]
if complete_url != "" and voice_url != "":
"""
Download the vocals and the accompaniment
"""
ext = str(complete_url).split(".")[-1]
acc_dst_file = os.path.join(work_dir, "acc.{}".format(ext))
acc_wav_dst_file = os.path.join(work_dir, "acc.wav")
err = self.download_and_transcode(complete_url, acc_dst_file, acc_wav_dst_file)
os.unlink(acc_dst_file)
if err == -1:
return gs_err_code_download_acc
if err == -2:
return gs_err_code_transcode_acc_v1
ext = str(voice_url).split(".")[-1]
vocal_dst_file = os.path.join(work_dir, "vocal.{}".format(ext))
vocal_wav_dst_file = os.path.join(work_dir, "vocal.wav")
err = self.download_and_transcode(voice_url, vocal_dst_file, vocal_wav_dst_file)
os.unlink(vocal_dst_file)
if err == -1:
return gs_err_code_download_vocal
if err == -2:
return gs_err_code_transcode_vocal_v1
return gs_err_code_success
return gs_err_code_silence_no_process
def process_worker(self):
logging.info("start process_worker .....")
base_dir = "/tmp/tone_shift_one"
if not os.path.exists(base_dir):
os.makedirs(base_dir)
while True:
worker_st = time.time()
cid, song_url = self.get_one_data_logic()
- # cid, song_url = self.get_url_by_id('175210503076374799')
+ # cid, song_url = self.get_url_by_id('611752105030548048')
if cid is None:
time.sleep(5)
logging.info("get one data is None ...")
continue
work_dir = os.path.join(base_dir, str(cid))
if os.path.exists(work_dir):
shutil.rmtree(work_dir)
os.makedirs(work_dir)
# First check whether the silence database already has a finished result for this item; if so, just download it
err = self.get_data_from_mysql(cid, work_dir)
if err != gs_err_code_success:
# Clean up the disk
shutil.rmtree(work_dir)
os.makedirs(work_dir)
err = self.pre_process(work_dir, song_url)
if err != gs_err_code_success:
self.update_state(str(cid), -err)
continue
st = time.time()
err, data = self.process_one(str(cid), work_dir)
logging.info("process_finish,{},{}".format(cid, time.time() - st))
if err == gs_err_code_success and len(data) != 0:
sql = "update svc_queue_table set state={},update_time={},svc_url=\"{}\" where song_id = {}". \
format(gs_state_finish, int(time.time()), ",".join(data), str(cid))
banned_user_map['db'] = "av_db"
update_db(sql, banned_user_map)
else:
self.update_state(str(cid), -err)
shutil.rmtree(work_dir)
logging.info("process_finish,{},{}".format(cid, time.time() - worker_st))
if __name__ == '__main__':
ts = ToneShift()
ts.process_worker()
diff --git a/AutoCoverTool/ref/online/common.py b/AutoCoverTool/ref/online/common.py
new file mode 100644
index 0000000..af3487a
--- /dev/null
+++ b/AutoCoverTool/ref/online/common.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+"""
+Bind the process to CPU cores.
+A script can be launched multiple times; each launch binds its own core and
+never binds to a core that is already taken.
+Each process binds n cores, or a caller-supplied list of core ids.
+"""
+
+import time
+import psutil
+import os
+import sys
+import hashlib
+import fcntl
+
+"""
+Automatically discover idle cores
+"""
+
+
+def exec_cmd_ints(cmd):
+ """
+ Run cmd and collect its integer output lines
+ :param cmd:
+ :return:
+ """
+ r = os.popen(cmd)
+ lines = r.readlines()
+ ids = []
+ for line in lines:
+ line = line.strip()
+ if line.isdigit():
+ id = int(float(line))
+ ids.append(id)
+ return ids
+
+
+def get_idle_kernel(n=1):
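+ # Strategy: list the PIDs of other running copies of this script via ps,
+ # ask pidstat which core each one runs on, mark those core groups as used,
+ # and return the first free group of n consecutive core ids.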
+ cur_id = os.getpid()
+ name = os.path.basename(sys.argv[0])
+ command = "ps -ef | grep {} |grep python | awk \'{{print $2}}\'".format(name)
+ print(command)
+ ids = exec_cmd_ints(command)
+
+ print(ids, cur_id)
+ # Collect every core that is already bound
+ count = psutil.cpu_count()
+ used = [False] * (count // n)
+ command = "pidstat | grep {} | awk \'{{print $(NF-1)}}\'"
+ for i in range(0, len(ids)):
+ if cur_id != ids[i]:
+ cmd = command.format(ids[i])
+ kers = exec_cmd_ints(cmd)
+ for ker in kers:
+ ker = ker // n
+ used[ker] = True
+ print(used)
+ # Pick n available cores
+ for i in range(0, len(used)):
+ if not used[i]:
+ res = []
+ cur_i = i * n
+ for idx in range(cur_i, cur_i+n):
+ if idx < count:
+ res.append(idx)
+ return res
+ return []  # no idle group found; psutil treats an empty list as "all eligible CPUs"
+
+
+def bind_kernel(n=1, kernel=[]):
+ p = psutil.Process()
+
+ # Acquire the lock
+ name = hashlib.md5(os.path.basename(sys.argv[0]).encode('utf-8')).hexdigest()
+ name = os.path.join("/tmp", name + ".lock")
+ if not os.path.exists(name):
+ with open(name, "w") as f:
+ f.write("0")
+ file = open(name)
+ fcntl.flock(file.fileno(), fcntl.LOCK_EX) # exclusive lock
+ print("lock file --- {}".format(name))
+ if len(kernel) > 0:
+ kernels = kernel
+ else:
+ kernels = get_idle_kernel(n)
+ p.cpu_affinity(kernels) # bind to the chosen cores
+ print("bind_kernel", kernels)
+ file.close() # release the lock
+ print("unlock file --- {}".format(name))
+
+
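+# A minimal usage sketch (an assumption about intended use: call once at
+# start-up in each worker process):
+#
+# bind_kernel(n=1) # pick one idle core under the file lock and bind to it
+# calc_forever() # stand-in for the real long-running workload
+
+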
+def calc_forever():
+ for i in range(0, 10000):
+ time.sleep(1000)
\ No newline at end of file
diff --git a/AutoCoverTool/ref/online/mobilenet_v2_custom.py b/AutoCoverTool/ref/online/mobilenet_v2_custom.py
new file mode 100644
index 0000000..57b1227
--- /dev/null
+++ b/AutoCoverTool/ref/online/mobilenet_v2_custom.py
@@ -0,0 +1,142 @@
+"""
+Copied directly from the torchvision code base.
+Purpose: mobilenet_v2 only accepts 3-channel input images, which does not meet our needs, so it was copied here for modification.
+"""
+
+from torch import nn
+
+
+def _make_divisible(v, divisor, min_value=None):
+ """
+ This function is taken from the original tf repo.
+ It ensures that all layers have a channel number that is divisible by 8
+ It can be seen here:
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+ :param v:
+ :param divisor:
+ :param min_value:
+ :return:
+ """
+ if min_value is None:
+ min_value = divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ # Make sure that round down does not go down by more than 10%.
+ if new_v < 0.9 * v:
+ new_v += divisor
+ return new_v
+
+
+class ConvBNReLU(nn.Sequential):
+ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+ padding = (kernel_size - 1) // 2
+ super(ConvBNReLU, self).__init__(
+ nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+ nn.BatchNorm2d(out_planes),
+ nn.ReLU6(inplace=True)
+ )
+
+
+class InvertedResidual(nn.Module):
+ def __init__(self, inp, oup, stride, expand_ratio):
+ super(InvertedResidual, self).__init__()
+ self.stride = stride
+ assert stride in [1, 2]
+
+ hidden_dim = int(round(inp * expand_ratio))
+ self.use_res_connect = self.stride == 1 and inp == oup
+
+ layers = []
+ if expand_ratio != 1:
+ # pw
+ layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+ layers.extend([
+ # dw
+ ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+ # pw-linear
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup),
+ ])
+ self.conv = nn.Sequential(*layers)
+
+ def forward(self, x):
+ if self.use_res_connect:
+ return x + self.conv(x)
+ else:
+ return self.conv(x)
+
+
+class MobileNetV2Custom(nn.Module):
+ def __init__(self, num_classes=2, in_channel=1, width_mult=1.0, inverted_residual_setting=None, round_nearest=8):
+ """
+ MobileNet V2 main class
+
+ Args:
+ num_classes (int): Number of classes
+ width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
+ inverted_residual_setting: Network structure
+ round_nearest (int): Round the number of channels in each layer to be a multiple of this number
+ Set to 1 to turn off rounding
+ """
+ super(MobileNetV2Custom, self).__init__()
+ block = InvertedResidual
+ input_channel = 32
+ last_channel = 1280
+
+ if inverted_residual_setting is None:
+ inverted_residual_setting = [
+ # t, c, n, s
+ [1, 16, 1, 1],
+ [6, 24, 2, 2],
+ [6, 32, 3, 2],
+ [6, 64, 4, 2],
+ [6, 96, 3, 1],
+ [6, 160, 3, 2],
+ [6, 320, 1, 1],
+ ]
+
+ # only check the first element, assuming user knows t,c,n,s are required
+ if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+ raise ValueError("inverted_residual_setting should be non-empty "
+ "or a 4-element list, got {}".format(inverted_residual_setting))
+
+ # building first layer
+ input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+ self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+ # Modified here: in_channel was originally 3
+ features = [ConvBNReLU(in_channel, input_channel, stride=2)]
+ # building inverted residual blocks
+ for t, c, n, s in inverted_residual_setting:
+ output_channel = _make_divisible(c * width_mult, round_nearest)
+ for i in range(n):
+ stride = s if i == 0 else 1
+ features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+ input_channel = output_channel
+ # building last several layers
+ features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
+ # make it nn.Sequential
+ self.features = nn.Sequential(*features)
+
+ # building classifier
+ self.classifier = nn.Sequential(
+ nn.Dropout(0.2),
+ nn.Linear(self.last_channel, num_classes),
+ )
+
+ # weight initialization
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode='fan_out')
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.ones_(m.weight)
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.Linear):
+ nn.init.normal_(m.weight, 0, 0.01)
+ nn.init.zeros_(m.bias)
+
+ def forward(self, x):
+ x = self.features(x)
+ x = x.mean([2, 3]) # global average pooling over the spatial dimensions
+ x = self.classifier(x)
+ return x
diff --git a/AutoCoverTool/ref/online/model.py b/AutoCoverTool/ref/online/model.py
new file mode 100644
index 0000000..c5e8adc
--- /dev/null
+++ b/AutoCoverTool/ref/online/model.py
@@ -0,0 +1,71 @@
+from mobilenet_v2_custom import MobileNetV2Custom
+import torch
+import torch.nn as nn
+
+MFCC_LEN = 80
+FRAME_LEN = 128
+
+
+class MobileNetV2Gender(MobileNetV2Custom):
+
+ def forward(self, x):
+ x = x.view([-1, 1, FRAME_LEN, MFCC_LEN])
+ return super(MobileNetV2Gender, self).forward(x)
+
+
+class MusicVoiceV5Model(nn.Module):
+ def __init__(self):
+ super(MusicVoiceV5Model, self).__init__()
+
+ def conv_bn(inp, oup, stride):
+ return nn.Sequential(
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
+ nn.BatchNorm2d(oup),
+ nn.ReLU(inplace=True)
+ )
+
+ def conv_dw(inp, oup, stride):
+ return nn.Sequential(
+ nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
+ nn.BatchNorm2d(inp),
+ nn.ReLU(inplace=True),
+
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup),
+ nn.ReLU(inplace=True),
+ )
+
+ self.model = nn.Sequential(
+ conv_bn(1, 32, 2),
+ conv_dw(32, 64, 1),
+ conv_dw(64, 128, 2),
+ conv_dw(128, 128, 1),
+ conv_dw(128, 256, 2),
+ conv_dw(256, 256, 1),
+ conv_dw(256, 512, 2),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 512, 1),
+ conv_dw(512, 1024, 2),
+ conv_dw(1024, 1024, 1),
+ nn.AvgPool2d((4, 3)),
+ )
+ self.fc = nn.Linear(1024, 2)
+
+ def forward(self, x):
+ x = x.view([-1, 1, FRAME_LEN, MFCC_LEN])
+ x = self.model(x)
+ x = x.view(-1, 1024)
+ x = self.fc(x)
+ return x
+
+
+def load_model(model_type, model_path, device):
+ model = model_type()
+ params = torch.load(model_path, map_location=torch.device(device))
+ model.load_state_dict(state_dict=params)
+ model.eval()
+ model.to(device)
+ return model
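+
+
+# Usage sketch (illustrative; assumes the models.zip from the readme has been
+# extracted so the .pth path below exists):
+# model = load_model(MusicVoiceV5Model, "./models/voice_005_rec_v5.pth", "cpu")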
diff --git a/AutoCoverTool/ref/online/readme.md b/AutoCoverTool/ref/online/readme.md
new file mode 100644
index 0000000..10a1f09
--- /dev/null
+++ b/AutoCoverTool/ref/online/readme.md
@@ -0,0 +1,50 @@
+# Male/Female Voice Classification
+
+```
+Model names and purposes:
+---gender_8k_ratev5_v6_adam.pth // male/female classifier for pure vocals (trained on the 8k pure-vocal dataset, mobilenet_v2, Adam optimizer)
+---gender_8k_v6_adam.pth // male/female classifier for vocals-with-music (trained on the 8k vocals-with-music dataset, mobilenet_v2, Adam optimizer)
+---voice_005_rec_v5.pth // pure-vocal segment classifier (400 manually labeled songs; vocals-with-music segments used as negatives; mobilenet_v1, SGD optimizer)
+---voice_10_v5.pth // vocals-with-music segment classifier (400 manually labeled songs; mobilenet_v1, SGD optimizer)
+Model download: https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/hub/voice_classification/models.zip
+```
+
+# Files
+```
+---common.py // CPU core-binding helpers
+---mobilenet_v2_custom.py // model code
+---model.py // wrapper layer for invoking the models
+---readme.md // this document
+---voice_class_online.py // the script used at runtime
+```
+
+# Environment Setup
+```
+cd /home/worker
+wget "https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/hub/voice_classification/bin/bin.zip"
+unzip bin.zip
+rm -f bin.zip
+export PATH=$PATH:/home/worker/bin # this line should also be added to .zshrc
+sudo yum install libsndfile-devel
+
+# The following can be installed manually
+conda create -n voice_class python=3.7 -y
+conda activate voice_class
+pip3 install librosa
+pip3 install psutil
+pip3 install torch==1.5 torchvision torchaudio
+```
+
+# Usage
+```
+After downloading and extracting the models, run it as shown in voice_class_online.py
+```
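+
+A minimal invocation sketch (argument order taken from the __main__ block of
+voice_class_online.py; the paths here are placeholders):
+```
+python voice_class_online.py /path/to/models /path/to/voice.mp4
+```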
+
+# Note:
+The code currently limits itself to a single CPU core; it is recommended to run several processes in parallel, sized to the machine's core count.
+
+# Performance test (measured on the GPU-2 machine with no core limit):
+20 online samples (10 male, 10 female)
+
+CPU: spend_time:tot=31.91|transcode=5.92|vb=3.12|gen_feature=3.5|predict=18.94
+GPU: spend_time:tot=15.64|transcode=6.34|vb=4.17|gen_feature=3.3|predict=1.443
diff --git a/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 b/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4
new file mode 100644
index 0000000..9b225ee
Binary files /dev/null and b/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 differ
diff --git a/AutoCoverTool/ref/online/voice_class_online.py b/AutoCoverTool/ref/online/voice_class_online.py
new file mode 100644
index 0000000..6041c94
--- /dev/null
+++ b/AutoCoverTool/ref/online/voice_class_online.py
@@ -0,0 +1,420 @@
+"""
+Online male/female voice classification tool
+1 Transcode to 16-bit mono
+2 Loudness equalization
+3 Classify with the models
+"""
+
+import os
+import sys
+import librosa
+import shutil
+import logging
+import time
+import torch.nn.functional as F
+import numpy as np
+from model import *
+# from common import bind_kernel
+
+logging.basicConfig(level=logging.INFO)
+
+os.environ["LRU_CACHE_CAPACITY"] = "1"
+
+# torch.set_num_threads(1)
+# bind_kernel(1)
+
+"""
+Temporary global variables used for timing
+"""
+
+transcode_time = 0
+vb_time = 0
+mfcc_time = 0
+predict_time = 0
+
+"""
+Error codes
+"""
+ERR_CODE_SUCCESS = 0 # success
+ERR_CODE_NO_FILE = -1 # file does not exist
+ERR_CODE_TRANSCODE = -2 # transcoding failed
+ERR_CODE_VOLUME_BALANCED = -3 # loudness equalization failed
+ERR_CODE_FEATURE_TOO_SHORT = -4 # feature sequence too short
+
+"""
+Constants
+"""
+
+FRAME_LEN = 128
+MFCC_LEN = 80
+
+EBUR128_BIN = "/opt/soft/bin/standard_audio_no_cut"
+# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut"
+GENDER_FEMALE = 0
+GENDER_MALE = 1
+GENDER_OTHER = 2
+"""
+Utility functions
+"""
+
+
+def exec_cmd(cmd):
+ ret = os.system(cmd)
+ if ret != 0:
+ return False
+ return True
+
+
+"""
+Business-logic functions
+"""
+
+
+def get_one_mfcc(file_url):
+ st = time.time()
+ data, sr = librosa.load(file_url, sr=16000)
+ if len(data) < 512:
+ return []
+ mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN)
+ mfcc = mfcc.transpose()
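+ # mfcc now has shape (num_frames, MFCC_LEN): one 80-dim vector per frame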
+ print("get_one_mfcc:spend_time={}".format(time.time() - st))
+ global mfcc_time
+ mfcc_time += time.time() - st
+ return mfcc
+
+
+def volume_balanced(src, dst):
+ st = time.time()
+ cmd = "{} {} {}".format(EBUR128_BIN, src, dst)
+ logging.info(cmd)
+ exec_cmd(cmd)
+ if not os.path.exists(dst):
+ logging.error("volume_balanced:cmd={}".format(cmd))
+ print("volume_balanced:spend_time={}".format(time.time() - st))
+
+ global vb_time
+ vb_time += time.time() - st
+ return os.path.exists(dst)
+
+
+def transcode(src, dst):
+ st = time.time()
+ cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst)
+ logging.info(cmd)
+ exec_cmd(cmd)
+ if not os.path.exists(dst):
+ logging.error("transcode:cmd={}".format(cmd))
+ print("transcode:spend_time={}".format(time.time() - st))
+ global transcode_time
+ transcode_time += time.time() - st
+ return os.path.exists(dst)
+
+
+class VoiceClass:
+
+ def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model):
+ """
+ Four models:
+ :param music_voice_pure_model: distinguishes pure vocals from everything else
+ :param music_voice_no_pure_model: distinguishes vocals-with-music from everything else
+ :param gender_pure_model: male/female classifier on pure vocals
+ :param gender_no_pure_model: male/female classifier on vocals-with-music
+ predict() tries the pure-vocal pair first and falls back to the vocals-with-music pair when the first pass returns GENDER_OTHER.
+ """
+ st = time.time()
+ self.device = "cpu"
+ self.batch_size = 256
+ self.music_voice_pure_model = load_model(MusicVoiceV5Model, music_voice_pure_model, self.device)
+ self.music_voice_no_pure_model = load_model(MusicVoiceV5Model, music_voice_no_pure_model, self.device)
+ self.gender_pure_model = load_model(MobileNetV2Gender, gender_pure_model, self.device)
+ self.gender_no_pure_model = load_model(MobileNetV2Gender, gender_no_pure_model, self.device)
+ logging.info("load model ok ! spend_time={}".format(time.time() - st))
+
+ def batch_predict(self, model, features):
+ st = time.time()
+ scores = []
+ with torch.no_grad():
+ for i in range(0, len(features), self.batch_size):
+ cur_data = features[i:i + self.batch_size].to(self.device)
+ predicts = model(cur_data)
+ predicts_score = F.softmax(predicts, dim=1)
+ scores.extend(predicts_score.cpu().numpy())
+ ret = np.array(scores)
+ global predict_time
+ predict_time += time.time() - st
+ return ret
+
+ def predict_pure(self, filename, features):
+ scores = self.batch_predict(self.music_voice_pure_model, features)
+ new_features = []
+ for idx, score in enumerate(scores):
+ if score[0] > 0.5: # not a vocal segment
+ continue
+ new_features.append(features[idx].numpy())
+
+ # Too few vocal segments; cannot process
+ # These parameters can be tuned
+ new_feature_len = len(new_features)
+ new_feature_rate = len(new_features) / len(features)
+ if new_feature_len < 4 or new_feature_rate < 0.4:
+ logging.warning(
+ "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, -1
+ new_features = torch.from_numpy(np.array(new_features))
+ scores = self.batch_predict(self.gender_pure_model, new_features)
+ f_avg = sum(scores[:, 0]) / len(scores)
+ m_avg = sum(scores[:, 1]) / len(scores)
+ female_rate = f_avg / (f_avg + m_avg)
+ if female_rate > 0.65:
+ return GENDER_FEMALE, female_rate
+ if female_rate < 0.12:
+ return GENDER_MALE, female_rate
+ logging.warning(
+ "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, female_rate
+
+ def predict_no_pure(self, filename, features):
+ scores = self.batch_predict(self.music_voice_no_pure_model, features)
+ new_features = []
+ for idx, score in enumerate(scores):
+ if score[0] > 0.5: # not a vocal segment
+ continue
+ new_features.append(features[idx].numpy())
+
+ # Too few vocal segments; cannot process
+ # These parameters can be tuned
+ new_feature_len = len(new_features)
+ new_feature_rate = len(new_features) / len(features)
+ if new_feature_len < 4 or new_feature_rate < 0.4:
+ logging.warning(
+ "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, -1
+ new_features = torch.from_numpy(np.array(new_features))
+ scores = self.batch_predict(self.gender_no_pure_model, new_features)
+ f_avg = sum(scores[:, 0]) / len(scores)
+ m_avg = sum(scores[:, 1]) / len(scores)
+ female_rate = f_avg / (f_avg + m_avg)
+ if female_rate > 0.75:
+ return GENDER_FEMALE, female_rate
+ if female_rate < 0.1:
+ return GENDER_MALE, female_rate
+ logging.warning(
+ "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+ )
+ return GENDER_OTHER, female_rate
+
+ def predict(self, filename, features):
+ st = time.time()
+ new_features = []
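+ # Slice the (num_frames, MFCC_LEN) feature matrix into non-overlapping
+ # windows of FRAME_LEN frames; a trailing remainder shorter than
+ # FRAME_LEN is dropped.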
+ for i in range(FRAME_LEN, len(features), FRAME_LEN):
+ new_features.append(features[i - FRAME_LEN: i])
+ new_features = torch.from_numpy(np.array(new_features))
+ gender, rate = self.predict_pure(filename, new_features)
+ if gender == GENDER_OTHER:
+ logging.info("start no pure process...")
+ return self.predict_no_pure(filename, new_features)
+ print("predict|spend_time={}".format(time.time() - st))
+ return gender, rate
+
+ def process_one_logic(self, filename, file_path, cache_dir):
+ tmp_wav = os.path.join(cache_dir, "tmp.wav")
+ tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav")
+ if not transcode(file_path, tmp_wav):
+ return ERR_CODE_TRANSCODE
+ if not volume_balanced(tmp_wav, tmp_vb_wav):
+ return ERR_CODE_VOLUME_BALANCED
+ features = get_one_mfcc(tmp_vb_wav)
+ if len(features) < FRAME_LEN:
+ logging.error("feature too short|file_path={}".format(file_path))
+ return ERR_CODE_FEATURE_TOO_SHORT
+ return self.predict(filename, features)
+
+ def process_one(self, file_path):
+ base_dir = os.path.dirname(file_path)
+ filename = os.path.splitext(file_path)[0]
+ cache_dir = os.path.join(base_dir, filename + "_cache")
+ if os.path.exists(cache_dir):
+ shutil.rmtree(cache_dir)
+ os.makedirs(cache_dir)
+ ret = self.process_one_logic(filename, file_path, cache_dir)
+ shutil.rmtree(cache_dir)
+ return ret
+
+ def process(self, file_path):
+ gender, female_rate = self.process_one(file_path)
+ logging.info("{}|gender={}|female_rate={}".format(file_path, gender, female_rate))
+ return gender, female_rate
+
+ def process_by_feature(self, feature_file):
+ """
+ Process a precomputed feature file directly
+ :param feature_file:
+ :return:
+ """
+ filename = os.path.splitext(feature_file)[0]
+ features = np.load(feature_file)
+ gender, female_rate = self.predict(filename, features)
+ return gender, female_rate
+
+
+def test_all_feature():
+ import glob
+ base_dir = "/data/datasets/music_voice_dataset_full/feature_online_data_v3"
+ female = glob.glob(os.path.join(base_dir, "female/*feature.npy"))
+ male = glob.glob(os.path.join(base_dir, "male/*feature.npy"))
+ other = glob.glob(os.path.join(base_dir, "other/*feature.npy"))
+ model_path = "/data/jianli.yang/voice_classification/online/models"
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+ vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+
+ tot_st = time.time()
+ ret_map = {
+ 0: {0: 0, 1: 0, 2: 0},
+ 1: {0: 0, 1: 0, 2: 0},
+ 2: {0: 0, 1: 0, 2: 0}
+ }
+ for file in female:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process_by_feature(file)
+ ret_map[0][gender] += 1
+ if gender != 0:
+ print("err:female->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ for file in male:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process_by_feature(file)
+ ret_map[1][gender] += 1
+ if gender != 1:
+ print("err:male->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ for file in other:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process_by_feature(file)
+ ret_map[2][gender] += 1
+ if gender != 2:
+ print("err:other->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ global transcode_time, vb_time, mfcc_time, predict_time
+ print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time,
+ vb_time, mfcc_time, predict_time))
+ f_f = ret_map[0][0]
+ f_m = ret_map[0][1]
+ f_o = ret_map[0][2]
+ m_f = ret_map[1][0]
+ m_m = ret_map[1][1]
+ m_o = ret_map[1][2]
+ o_f = ret_map[2][0]
+ o_m = ret_map[2][1]
+ o_o = ret_map[2][2]
+
+ print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
+ print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
+ print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
+ # Female precision and recall
+ f_acc = f_f / (f_f + m_f + o_f)
+ f_recall = f_f / (f_f + f_m + f_o)
+ # Male precision and recall
+ m_acc = m_m / (m_m + f_m + o_m)
+ m_recall = m_m / (m_m + m_f + m_o)
+ print("female: acc={}|recall={}".format(f_acc, f_recall))
+ print("male: acc={}|recall={}".format(m_acc, m_recall))
+
+
+def test_all():
+ import glob
+ base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200"
+ female = glob.glob(os.path.join(base_dir, "female/*mp4"))
+ male = glob.glob(os.path.join(base_dir, "male/*mp4"))
+ other = glob.glob(os.path.join(base_dir, "other/*mp4"))
+ model_path = "/data/jianli.yang/voice_classification/online/models"
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+ vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+
+ tot_st = time.time()
+ ret_map = {
+ 0: {0: 0, 1: 0, 2: 0},
+ 1: {0: 0, 1: 0, 2: 0},
+ 2: {0: 0, 1: 0, 2: 0}
+ }
+ for file in female:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process(file)
+ ret_map[0][gender] += 1
+ if gender != 0:
+ print("err:female->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ for file in male:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process(file)
+ ret_map[1][gender] += 1
+ if gender != 1:
+ print("err:male->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ for file in other:
+ st = time.time()
+ print("------------------------------>>>>>")
+ gender, female_score = vc.process(file)
+ ret_map[2][gender] += 1
+ if gender != 2:
+ print("err:other->{}|{}|{}".format(gender, file, female_score))
+ print("process|spend_tm=={}".format(time.time() - st))
+
+ global transcode_time, vb_time, mfcc_time, predict_time
+ print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time,
+ vb_time, mfcc_time, predict_time))
+ f_f = ret_map[0][0]
+ f_m = ret_map[0][1]
+ f_o = ret_map[0][2]
+ m_f = ret_map[1][0]
+ m_m = ret_map[1][1]
+ m_o = ret_map[1][2]
+ o_f = ret_map[2][0]
+ o_m = ret_map[2][1]
+ o_o = ret_map[2][2]
+
+ print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o))
+ print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o))
+ print("om:{},of:{},oo:{}".format(o_m, o_f, o_o))
+ # Female precision and recall
+ f_acc = f_f / (f_f + m_f + o_f)
+ f_recall = f_f / (f_f + f_m + f_o)
+ # Male precision and recall
+ m_acc = m_m / (m_m + f_m + o_m)
+ m_recall = m_m / (m_m + m_f + m_o)
+ print("female: acc={}|recall={}".format(f_acc, f_recall))
+ print("male: acc={}|recall={}".format(m_acc, m_recall))
+
+
+if __name__ == "__main__":
+ # test_all()
+ # test_all_feature()
+ model_path = sys.argv[1]
+ voice_path = sys.argv[2]
+ music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+ music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+ gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+ gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+ vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+ for i in range(0, 1):
+ st = time.time()
+ print("------------------------------>>>>>")
+ vc.process(voice_path)
+ print("process|spend_tm=={}".format(time.time() - st))
