diff --git a/AutoCoverTool/online/tone_shift_one.py b/AutoCoverTool/online/tone_shift_one.py
index e395c8d..9a422f8 100644
--- a/AutoCoverTool/online/tone_shift_one.py
+++ b/AutoCoverTool/online/tone_shift_one.py
@@ -1,328 +1,338 @@
 """
 Processing by pitch shift:
 1. Download
 2. Separate vocals and accompaniment
 3. Pitch-shift the vocals by +2/-2; the accompaniment keeps its original pitch
 4. Remix
 """
 import os
 import json
 import shutil
 import librosa
 import logging
 import numpy as np
 from ref.music_remover.separate_interface import SeparateInterface
 from online.inference_worker import upload_file2cos, gs_state_use, gs_state_finish, gs_state_default
 from online.common import *
+from ref.online.voice_class_online import VoiceClass
 
 logging.basicConfig(filename='/tmp/tone_shift_one.log', level=logging.INFO)
 
 gs_tone_shift_exe = "/opt/soft/bin/tone_shift_exe"
 gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"
 gs_err_code_success = 0
 gs_err_code_tone_shift = 1
 gs_err_code_mix = 2
 gs_err_code_transcode = 3
 gs_err_code_upload = 4
 gs_err_code_download = 5
 gs_err_code_trans_to_mp3 = 6
 gs_err_code_separate = 7
 gs_err_code_duration_too_long = 8
 gs_err_code_duration_no_vocal = 9
 gs_err_code_duration_err = 10
 gs_err_code_transcode_acc = 11
 gs_err_code_upload_acc = 12
 gs_err_code_download_acc = 13
 gs_err_code_download_vocal = 14
 gs_err_code_transcode_acc_v1 = 15
 gs_err_code_transcode_vocal_v1 = 16
 gs_err_code_silence_no_data = 17
 gs_err_code_silence_no_process = 18
 
 
 def exec_cmd(cmd):
     r = os.popen(cmd)
     text = r.read()
     r.close()
     return text
 
 
 def get_d(audio_path):
     cmd = "ffprobe -v quiet -print_format json -show_format -show_streams {}".format(audio_path)
     data = exec_cmd(cmd)
     data = json.loads(data)
     # returns the duration in seconds
     if 'format' in data.keys() and 'duration' in data['format']:
         return float(data["format"]["duration"])
     return -1
 
 
 def get_mean_power(audio_path):
     sr = 44100
     audio, sr = librosa.load(audio_path, sr=sr, mono=True)
     mm = np.mean(np.abs(audio))
     return mm
 
 
 class ToneShift:
     def __init__(self):
         self.separate_inst = SeparateInterface()
+        model_path = "./models"
+        music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+        music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+        gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+        gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+
+        self.voice_class = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model,
+                                      gender_no_pure_model)
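+        # NOTE: these four .pth files are the models listed in ref/online/readme.md;
+        # "./models" is resolved against the worker's current working directory,
+        # so the process must be started from a directory that contains them.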
 
     def update_state(self, song_id, state):
         sql = "update svc_queue_table set state={},update_time={} where song_id = {}". \
             format(state, int(time.time()), song_id)
         banned_user_map['db'] = "av_db"
         update_db(sql, banned_user_map)
 
     def get_url_by_id(self, song_id):
         sql = "select song_id, url from svc_queue_table where song_id={}".format(song_id)
         banned_user_map["db"] = "av_db"
         data = get_data_by_mysql(sql)
         if len(data) == 0:
             return None, None
         return str(data[0][0]), data[0][1]
 
     def get_one_data_logic(self):
         """
         Fetch by song_src priority: 5, then 4, then 3.
         :return:
         """
         song_src_arr = [5, 4, 3]
         for song_src in song_src_arr:
             song_id, song_url = self.get_one_data(song_src=song_src)
             if song_id is not None:
                 return song_id, song_url
         return None, None
 
     def get_one_data(self, song_src=3):
         sql = "select song_id, url from svc_queue_table where state = 0 and song_src={} order by create_time asc limit 1".format(
             song_src)
         banned_user_map["db"] = "av_db"
         data = get_data_by_mysql(sql, banned_user_map)
         if len(data) == 0:
             return None, None
         song_id, song_url = data[0]
         if song_id != "":
             self.update_state(song_id, gs_state_use)
         return str(song_id), song_url
 
     def pre_process(self, work_dir, song_url):
         """
         Create the working directory and download the source data.
         :return:
         """
         if "?sign=" in song_url:
             return gs_err_code_download
         ext = str(song_url).split(".")[-1]
         dst_file = "{}/src_origin.{}".format(work_dir, ext)
         cmd = "wget {} -O {}".format(song_url, dst_file)
         os.system(cmd)
         if not os.path.exists(dst_file):
             return gs_err_code_download
         duration = get_d(dst_file)
         if duration < 0:
             return gs_err_code_duration_err
         print("Duration:", dst_file, duration)
         if duration > 20 * 60:
             return gs_err_code_duration_too_long
         dst_mp3_file = "{}/src.wav".format(work_dir)
         cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} ".format(dst_file, dst_mp3_file)
         os.system(cmd)
         if not os.path.exists(dst_mp3_file):
             return gs_err_code_trans_to_mp3
         return gs_err_code_success
 
     def tone_shift_one(self, in_file, dst_file, pitch):
         cmd = "{} {} {} {}".format(gs_tone_shift_exe, in_file, dst_file, pitch)
         os.system(cmd)
         return os.path.exists(dst_file)
 
     def mix(self, cid, vocal_path, acc_path, tp):
         if tp == 1:
             vocal_pitch = 2
             acc_pitch = 0
         else:
             vocal_pitch = -2
             acc_pitch = 0
         vocal_path_2 = vocal_path.replace(".wav", "_{}.wav".format(vocal_pitch))
         acc_path_2 = acc_path.replace(".wav", "_{}.wav".format(acc_pitch))
         err = self.tone_shift_one(vocal_path, vocal_path_2, vocal_pitch)
         if not err:
-            return gs_err_code_tone_shift, None
+            return gs_err_code_tone_shift, None, None
+        gender, female_rate = self.voice_class.process_one(vocal_path_2)
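+        # gender: 0 = female, 1 = male, 2 = other (constants in voice_class_online.py);
+        # note that classification runs on the already pitch-shifted vocal track.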
         err = self.tone_shift_one(acc_path, acc_path_2, acc_pitch)
         if not err:
-            return gs_err_code_tone_shift, None
+            return gs_err_code_tone_shift, None, None
         base_dir = os.path.dirname(vocal_path)
         mix_path = "{}/mix_{}_{}.wav".format(base_dir, vocal_pitch, acc_pitch)
         cmd = "{} {} {} {}".format(gs_simple_mixer_path, vocal_path_2, acc_path_2, mix_path)
         print("exec_cmd={}".format(cmd))
         os.system(cmd)
         if not os.path.exists(mix_path):
-            return gs_err_code_mix, None
+            return gs_err_code_mix, None, None
 
         # transcode
         mix_path_mp3 = mix_path.replace(".wav", ".mp4")
         cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(mix_path, mix_path_mp3)
         os.system(cmd)
         if not os.path.exists(mix_path_mp3):
-            return gs_err_code_transcode, None
+            return gs_err_code_transcode, None, None
 
         # upload to COS
         mix_name = os.path.basename(mix_path_mp3)
         key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name)
         if not upload_file2cos(key, mix_path_mp3):
-            return gs_err_code_upload, None
-        return gs_err_code_success, key
+            return gs_err_code_upload, None, None
+        return gs_err_code_success, key, gender
 
     def upload_acc(self, cid, acc_path):
         # transcode
         mix_path_aac = acc_path.replace(".wav", ".m4a")
         cmd = "ffmpeg -i {} -b:a 128k -c:a aac -ar 44100 -ac 2 -y {} -loglevel fatal".format(acc_path, mix_path_aac)
         os.system(cmd)
         if not os.path.exists(mix_path_aac):
             return gs_err_code_transcode_acc, None
         # upload
         mix_name = os.path.basename(mix_path_aac)
         key = "av_res/svc_res_tone_shift/{}/{}".format(str(cid), mix_name)
         if not upload_file2cos(key, mix_path_aac):
             return gs_err_code_upload_acc, None
         return gs_err_code_success, key
 
     def process_one(self, cid, work_dir):
         """
         :param cid:
         :param work_dir:
         :return:
         """
         src_mp3 = os.path.join(work_dir, "src.wav")
         vocal_path = os.path.join(work_dir, "vocal.wav")
         acc_path = os.path.join(work_dir, "acc.wav")
         if not (os.path.exists(vocal_path) and os.path.exists(acc_path)):
             if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path):
                 return gs_err_code_separate, []
             if not os.path.exists(vocal_path) or not os.path.exists(acc_path):
                 return gs_err_code_separate, []
 
         # When the mean energy of the vocal track is below a threshold, treat the song
         # as having no vocals (0.01 was the empirical bound from sample analysis; see
         # the sketch after this file).
         # Samples without vocals: [0.0056, 0.0003]; with vocals (current minimum): [0.046, 0.049].
         print("power:{},{}".format(cid, get_mean_power(vocal_path)))
         if get_mean_power(vocal_path) < 0.02:
             return gs_err_code_duration_no_vocal, []
 
-        err, type1_mix_mp3 = self.mix(cid, vocal_path, acc_path, 1)
+        err, type1_mix_mp3, gender = self.mix(cid, vocal_path, acc_path, 1)
         if err != gs_err_code_success:
             return err, []
-        err, type2_mix_mp3 = self.mix(cid, vocal_path, acc_path, 2)
+        err, type2_mix_mp3, gender2 = self.mix(cid, vocal_path, acc_path, 2)
         if err != gs_err_code_success:
             return err, []
 
         # upload the accompaniment file
         # err, acc_path_m4a = self.upload_acc(cid, acc_path)
         # if err != gs_err_code_success:
         #     return err, []
-        return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3]
+        return gs_err_code_success, [type1_mix_mp3, type2_mix_mp3, str(gender), str(gender2)]
 
     def download_and_transcode(self, url, local_path, local_path_wav):
         cmd = "wget {} -O {}".format(url, local_path)
         os.system(cmd)
         if not os.path.exists(local_path):
             return -1
 
         cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(local_path, local_path_wav)
         os.system(cmd)
         if not os.path.exists(local_path_wav):
             return -2
         return 0
 
     def get_data_from_mysql(self, cid, work_dir):
         sql = "select starmaker_songid,task_url,complete_url,voice_url from starmaker_musicbook.silence where starmaker_songid={} order by task_id desc limit 1".format(
             cid)
         data = get_data_by_mysql(sql, banned_user_map)
         if len(data) == 0:
             return gs_err_code_silence_no_data
         song_id, task_url, complete_url, voice_url = data[0]
         if complete_url != "" and voice_url != "":
             """
             Download the vocal and accompaniment tracks.
             """
             ext = str(complete_url).split(".")[-1]
             acc_dst_file = os.path.join(work_dir, "acc.{}".format(ext))
             acc_wav_dst_file = os.path.join(work_dir, "acc.wav")
             err = self.download_and_transcode(complete_url, acc_dst_file, acc_wav_dst_file)
             os.unlink(acc_dst_file)
             if err == -1:
                 return gs_err_code_download_acc
             if err == -2:
                 return gs_err_code_transcode_acc_v1
 
             ext = str(voice_url).split(".")[-1]
             vocal_dst_file = os.path.join(work_dir, "vocal.{}".format(ext))
             vocal_wav_dst_file = os.path.join(work_dir, "vocal.wav")
             err = self.download_and_transcode(voice_url, vocal_dst_file, vocal_wav_dst_file)
             os.unlink(vocal_dst_file)
             if err == -1:
                 return gs_err_code_download_vocal
             if err == -2:
                 return gs_err_code_transcode_vocal_v1
             return gs_err_code_success
         return gs_err_code_silence_no_process
     def process_worker(self):
         logging.info("start process_worker .....")
         base_dir = "/tmp/tone_shift_one"
         if not os.path.exists(base_dir):
             os.makedirs(base_dir)
 
         while True:
             worker_st = time.time()
             cid, song_url = self.get_one_data_logic()
-            # cid, song_url = self.get_url_by_id('175210503076374799')
+            # cid, song_url = self.get_url_by_id('611752105030548048')
             if cid is None:
                 time.sleep(5)
                 logging.info("get one data is None ...")
                 continue
 
             work_dir = os.path.join(base_dir, str(cid))
             if os.path.exists(work_dir):
                 shutil.rmtree(work_dir)
             os.makedirs(work_dir)
 
             # First check whether the silence database already has finished results
             # for this song; if so, just download them directly.
             err = self.get_data_from_mysql(cid, work_dir)
             if err != gs_err_code_success:
                 # clear the disk
                 shutil.rmtree(work_dir)
                 os.makedirs(work_dir)
                 err = self.pre_process(work_dir, song_url)
                 if err != gs_err_code_success:
                     self.update_state(str(cid), -err)
                     continue
 
             st = time.time()
             err, data = self.process_one(str(cid), work_dir)
             logging.info("process_finish,{},{}".format(cid, time.time() - st))
             if err == gs_err_code_success and len(data) != 0:
                 sql = "update svc_queue_table set state={},update_time={},svc_url=\"{}\" where song_id = {}". \
                     format(gs_state_finish, int(time.time()), ",".join(data), str(cid))
                 banned_user_map['db'] = "av_db"
                 update_db(sql, banned_user_map)
             else:
                 self.update_state(str(cid), -err)
             shutil.rmtree(work_dir)
             logging.info("process_finish,{},{}".format(cid, time.time() - worker_st))
 
 
 if __name__ == '__main__':
     ts = ToneShift()
     ts.process_worker()
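The no-vocal gate in `process_one` deserves a standalone illustration: the separated vocal stem is judged silent purely by mean absolute amplitude. A minimal sketch of the same check, assuming a mono 44.1 kHz load exactly as in `get_mean_power` (`has_vocals` is a hypothetical helper, not part of the patch):

```python
import librosa
import numpy as np

NO_VOCAL_THRESHOLD = 0.02  # cut-off used by ToneShift.process_one


def has_vocals(vocal_wav_path):
    """Compare the mean absolute amplitude of the mono 44.1 kHz stem against
    the empirical threshold; observed values were ~0.0003-0.0056 for silent
    stems and >= 0.046 for stems that actually contain vocals."""
    audio, _ = librosa.load(vocal_wav_path, sr=44100, mono=True)
    return float(np.mean(np.abs(audio))) >= NO_VOCAL_THRESHOLD
```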
diff --git a/AutoCoverTool/ref/online/common.py b/AutoCoverTool/ref/online/common.py
new file mode 100644
index 0000000..af3487a
--- /dev/null
+++ b/AutoCoverTool/ref/online/common.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+"""
+CPU-core binding for worker processes.
+A script may be started several times; each instance binds to its own core(s),
+and two instances never bind to the same core.
+Each process either picks n idle cores automatically or binds to the core ids
+passed in by the caller.
+"""
+
+import time
+import psutil
+import os
+import sys
+import hashlib
+import fcntl
+
+"""
+Automatically pick idle cores.
+"""
+
+
+def exec_cmd_ints(cmd):
+    """
+    Run cmd and parse its stdout lines as integers.
+    :param cmd:
+    :return:
+    """
+    r = os.popen(cmd)
+    lines = r.readlines()
+    ids = []
+    for line in lines:
+        line = line.strip()
+        if line.isdigit():
+            id = int(float(line))
+            ids.append(id)
+    return ids
+
+
+def get_idle_kernel(n=1):
+    cur_id = os.getpid()
+    name = os.path.basename(sys.argv[0])
+    command = "ps -ef | grep {} |grep python | awk \'{{print $2}}\'".format(name)
+    print(command)
+    ids = exec_cmd_ints(command)
+
+    print(ids, cur_id)
+    # collect every core that is already bound by another instance
+    count = psutil.cpu_count()
+    used = [False] * (count // n)
+    command = "pidstat | grep {} | awk \'{{print $(NF-1)}}\'"
+    for i in range(0, len(ids)):
+        if cur_id != ids[i]:
+            cmd = command.format(ids[i])
+            kers = exec_cmd_ints(cmd)
+            for ker in kers:
+                ker = ker // n
+                used[ker] = True
+    print(used)
+    # pick n available cores
+    for i in range(0, len(used)):
+        if not used[i]:
+            res = []
+            cur_i = i * n
+            for idx in range(cur_i, cur_i + n):
+                if idx < count:
+                    res.append(idx)
+            return res
+    return [0]  # fallback: every slot is taken; bind to core 0 (was a bare 0, which cpu_affinity rejects)
+
+
+def bind_kernel(n=1, kernel=[]):
+    p = psutil.Process()
+
+    # take the lock
+    name = hashlib.md5(os.path.basename(sys.argv[0]).encode('utf-8')).hexdigest()
+    name = os.path.join("/tmp", name + ".lock")
+    if not os.path.exists(name):
+        with open(name, "w") as f:
+            f.write("0")
+    file = open(name)
+    fcntl.flock(file.fileno(), fcntl.LOCK_EX)  # exclusive lock
+    print("lock file --- {}".format(name))
+    if len(kernel) > 0:
+        kernels = kernel
+    else:
+        kernels = get_idle_kernel(n)
+    p.cpu_affinity(kernels)  # bind to the chosen cores
+    print("bind_kernel", kernels)
+    file.close()  # release the lock
+    print("unlock file --- {}".format(name))
+
+
+def calc_forever():
+    for i in range(0, 10000):
+        time.sleep(1000)
\ No newline at end of file
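How a worker is meant to use this module, as a sketch. It mirrors the calls that are currently commented out near the top of voice_class_online.py; pinning PyTorch to one thread alongside the one bound core is that file's own convention:

```python
import torch
from common import bind_kernel

torch.set_num_threads(1)  # keep PyTorch on the single bound core
bind_kernel(1)            # claim one idle core, serialized by the /tmp file lock

# ... load models and enter the processing loop ...
```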
diff --git a/AutoCoverTool/ref/online/mobilenet_v2_custom.py b/AutoCoverTool/ref/online/mobilenet_v2_custom.py
new file mode 100644
index 0000000..57b1227
--- /dev/null
+++ b/AutoCoverTool/ref/online/mobilenet_v2_custom.py
@@ -0,0 +1,142 @@
+"""
+Copied straight out of the torchvision code base.
+Reason: the stock mobilenet_v2 only accepts 3-channel image input, which does
+not fit our use case, so it was copied here and modified.
+"""
+
+from torch import nn
+
+
+def _make_divisible(v, divisor, min_value=None):
+    """
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    :param v:
+    :param divisor:
+    :param min_value:
+    :return:
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class ConvBNReLU(nn.Sequential):
+    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+        padding = (kernel_size - 1) // 2
+        super(ConvBNReLU, self).__init__(
+            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+            nn.BatchNorm2d(out_planes),
+            nn.ReLU6(inplace=True)
+        )
+
+
+class InvertedResidual(nn.Module):
+    def __init__(self, inp, oup, stride, expand_ratio):
+        super(InvertedResidual, self).__init__()
+        self.stride = stride
+        assert stride in [1, 2]
+
+        hidden_dim = int(round(inp * expand_ratio))
+        self.use_res_connect = self.stride == 1 and inp == oup
+
+        layers = []
+        if expand_ratio != 1:
+            # pw
+            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+        layers.extend([
+            # dw
+            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+            # pw-linear
+            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(oup),
+        ])
+        self.conv = nn.Sequential(*layers)
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class MobileNetV2Custom(nn.Module):
+    def __init__(self, num_classes=2, in_channel=1, width_mult=1.0, inverted_residual_setting=None, round_nearest=8):
+        """
+        MobileNet V2 main class
+
+        Args:
+            num_classes (int): Number of classes
+            in_channel (int): Number of input channels (the stock model hard-codes 3)
+            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
+            inverted_residual_setting: Network structure
+            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
+            Set to 1 to turn off rounding
+        """
+        super(MobileNetV2Custom, self).__init__()
+        block = InvertedResidual
+        input_channel = 32
+        last_channel = 1280
+
+        if inverted_residual_setting is None:
+            inverted_residual_setting = [
+                # t, c, n, s
+                [1, 16, 1, 1],
+                [6, 24, 2, 2],
+                [6, 32, 3, 2],
+                [6, 64, 4, 2],
+                [6, 96, 3, 1],
+                [6, 160, 3, 2],
+                [6, 320, 1, 1],
+            ]
+
+        # only check the first element, assuming user knows t,c,n,s are required
+        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+            raise ValueError("inverted_residual_setting should be non-empty "
+                             "or a 4-element list, got {}".format(inverted_residual_setting))
+
+        # building first layer
+        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+        # the modification: in_channel used to be hard-coded to 3
+        features = [ConvBNReLU(in_channel, input_channel, stride=2)]
+        # building inverted residual blocks
+        for t, c, n, s in inverted_residual_setting:
+            output_channel = _make_divisible(c * width_mult, round_nearest)
+            for i in range(n):
+                stride = s if i == 0 else 1
+                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+                input_channel = output_channel
+        # building last several layers
+        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
+        # make it nn.Sequential
+        self.features = nn.Sequential(*features)
+
+        # building classifier
+        self.classifier = nn.Sequential(
+            nn.Dropout(0.2),
+            nn.Linear(self.last_channel, num_classes),
+        )
+
+        # weight initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.zeros_(m.bias)
+
+    def forward(self, x):
+        x = self.features(x)
+        x = x.mean([2, 3])
+        x = self.classifier(x)
+        return x
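A quick sanity check of the one-channel change, as a sketch; the input shape follows the FRAME_LEN=128, MFCC_LEN=80 windows used by model.py:

```python
import torch
from mobilenet_v2_custom import MobileNetV2Custom

# One 128x80 MFCC window with a single input channel; the stock torchvision
# MobileNetV2 would require 3 channels here.
model = MobileNetV2Custom(num_classes=2, in_channel=1)
model.eval()
with torch.no_grad():
    x = torch.randn(1, 1, 128, 80)
    logits = model(x)
print(logits.shape)  # torch.Size([1, 2])
```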
"https://av-audit-sync-in-1256122840.cos.ap-mumbai.myqcloud.com/hub/voice_classification/bin/bin.zip" +unzip bin.zip +rm -f bin.zip +export PATH=$PATH:/home/worker/bin # 需要写入到.zshrc中 +sudo yum install libsndfile-devel + +# 以下使用手动安装即可 +conda create -n voice_class python=3.7 -y +conda activate voice_class +pip3 install librosa +pip3 install psutil +pip3 install torch==1.5 torchvision torchaudio +``` + +# 使用说明 +``` +下载模型并解压后,按照voice_class_online.py中的运行方式运行即可 +``` + +# 注意: +目前代码中限制了CPU的核心数量,只允许占用一个核,建议根据核心的情况多开几个进程做处理 + +# 性能测试(不加性能限制的情况下在GPU-2机器上测试得到): +20个线上样本(男10,女10) + +CPU情况:spend_time:tot=31.91|transcode=5.92|vb=3.12|gen_feature=3.5|predict=18.94 +GPU情况:spend_time:tot=15.64|transcode=6.34|vb=4.17|gen_feature=3.3|predict=1.443 diff --git a/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 b/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 new file mode 100644 index 0000000..9b225ee Binary files /dev/null and b/AutoCoverTool/ref/online/resource/female/4785074274851990.mp4 differ diff --git a/AutoCoverTool/ref/online/voice_class_online.py b/AutoCoverTool/ref/online/voice_class_online.py new file mode 100644 index 0000000..6041c94 --- /dev/null +++ b/AutoCoverTool/ref/online/voice_class_online.py @@ -0,0 +1,420 @@ +""" +男女声分类在线工具 +1 转码为16bit单声道 +2 均衡化 +3 模型分类 +""" + +import os +import sys +import librosa +import shutil +import logging +import time +import torch.nn.functional as F +import numpy as np +from model import * +# from common import bind_kernel + +logging.basicConfig(level=logging.INFO) + +os.environ["LRU_CACHE_CAPACITY"] = "1" + +# torch.set_num_threads(1) +# bind_kernel(1) + +""" +临时用一下,全局使用的变量 +""" + +transcode_time = 0 +vb_time = 0 +mfcc_time = 0 +predict_time = 0 + +""" +错误码 +""" +ERR_CODE_SUCCESS = 0 # 处理成功 +ERR_CODE_NO_FILE = -1 # 文件不存在 +ERR_CODE_TRANSCODE = -2 # 转码失败 +ERR_CODE_VOLUME_BALANCED = -3 # 均衡化失败 +ERR_CODE_FEATURE_TOO_SHORT = -4 # 特征文件太短 + +""" +常量 +""" + +FRAME_LEN = 128 +MFCC_LEN = 80 + +EBUR128_BIN = "/opt/soft/bin/standard_audio_no_cut" +# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut" +GENDER_FEMALE = 0 +GENDER_MALE = 1 +GENDER_OTHER = 2 +""" +通用函数 +""" + + +def exec_cmd(cmd): + ret = os.system(cmd) + if ret != 0: + return False + return True + + +""" +业务需要的函数 +""" + + +def get_one_mfcc(file_url): + st = time.time() + data, sr = librosa.load(file_url, sr=16000) + if len(data) < 512: + return [] + mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN) + mfcc = mfcc.transpose() + print("get_one_mfcc:spend_time={}".format(time.time() - st)) + global mfcc_time + mfcc_time += time.time() - st + return mfcc + + +def volume_balanced(src, dst): + st = time.time() + cmd = "{} {} {}".format(EBUR128_BIN, src, dst) + logging.info(cmd) + exec_cmd(cmd) + if not os.path.exists(dst): + logging.error("volume_balanced:cmd={}".format(cmd)) + print("volume_balanced:spend_time={}".format(time.time() - st)) + + global vb_time + vb_time += time.time() - st + return os.path.exists(dst) + + +def transcode(src, dst): + st = time.time() + cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst) + logging.info(cmd) + exec_cmd(cmd) + if not os.path.exists(dst): + logging.error("transcode:cmd={}".format(cmd)) + print("transcode:spend_time={}".format(time.time() - st)) + global transcode_time + transcode_time += time.time() - st + return os.path.exists(dst) + + +class VoiceClass: + + def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, 
diff --git a/AutoCoverTool/ref/online/voice_class_online.py b/AutoCoverTool/ref/online/voice_class_online.py
new file mode 100644
index 0000000..6041c94
--- /dev/null
+++ b/AutoCoverTool/ref/online/voice_class_online.py
@@ -0,0 +1,420 @@
+"""
+Online male/female voice classification tool.
+1 Transcode to 16-bit mono
+2 Loudness-balance
+3 Classify with the models
+"""
+
+import os
+import sys
+import librosa
+import shutil
+import logging
+import time
+import torch.nn.functional as F
+import numpy as np
+from model import *
+# from common import bind_kernel
+
+logging.basicConfig(level=logging.INFO)
+
+os.environ["LRU_CACHE_CAPACITY"] = "1"
+
+# torch.set_num_threads(1)
+# bind_kernel(1)
+
+"""
+Temporary global state: timing accumulators.
+"""
+
+transcode_time = 0
+vb_time = 0
+mfcc_time = 0
+predict_time = 0
+
+"""
+Error codes
+"""
+ERR_CODE_SUCCESS = 0  # processed successfully
+ERR_CODE_NO_FILE = -1  # file does not exist
+ERR_CODE_TRANSCODE = -2  # transcode failed
+ERR_CODE_VOLUME_BALANCED = -3  # loudness balancing failed
+ERR_CODE_FEATURE_TOO_SHORT = -4  # feature sequence too short
+
+"""
+Constants
+"""
+
+FRAME_LEN = 128
+MFCC_LEN = 80
+
+EBUR128_BIN = "/opt/soft/bin/standard_audio_no_cut"
+# EBUR128_BIN = "/Users/yangjianli/linux/opt/soft/bin/standard_audio_no_cut"
+GENDER_FEMALE = 0
+GENDER_MALE = 1
+GENDER_OTHER = 2
+"""
+Generic helpers
+"""
+
+
+def exec_cmd(cmd):
+    ret = os.system(cmd)
+    if ret != 0:
+        return False
+    return True
+
+
+"""
+Business-logic helpers
+"""
+
+
+def get_one_mfcc(file_url):
+    st = time.time()
+    data, sr = librosa.load(file_url, sr=16000)
+    if len(data) < 512:
+        return []
+    mfcc = librosa.feature.mfcc(y=data, sr=sr, n_fft=512, hop_length=256, n_mfcc=MFCC_LEN)
+    mfcc = mfcc.transpose()
+    print("get_one_mfcc:spend_time={}".format(time.time() - st))
+    global mfcc_time
+    mfcc_time += time.time() - st
+    return mfcc
+
+
+def volume_balanced(src, dst):
+    st = time.time()
+    cmd = "{} {} {}".format(EBUR128_BIN, src, dst)
+    logging.info(cmd)
+    exec_cmd(cmd)
+    if not os.path.exists(dst):
+        logging.error("volume_balanced:cmd={}".format(cmd))
+    print("volume_balanced:spend_time={}".format(time.time() - st))
+
+    global vb_time
+    vb_time += time.time() - st
+    return os.path.exists(dst)
+
+
+def transcode(src, dst):
+    st = time.time()
+    cmd = "ffmpeg -loglevel quiet -i {} -ar 16000 -ac 1 {}".format(src, dst)
+    logging.info(cmd)
+    exec_cmd(cmd)
+    if not os.path.exists(dst):
+        logging.error("transcode:cmd={}".format(cmd))
+    print("transcode:spend_time={}".format(time.time() - st))
+    global transcode_time
+    transcode_time += time.time() - st
+    return os.path.exists(dst)
+
+
+class VoiceClass:
+
+    def __init__(self, music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model):
+        """
+        Four models:
+        :param music_voice_pure_model: separates pure vocals from everything else
+        :param music_voice_no_pure_model: separates vocals-with-music from everything else
+        :param gender_pure_model: male/female on pure vocals
+        :param gender_no_pure_model: male/female on vocals with music
+        """
+        st = time.time()
+        self.device = "cpu"
+        self.batch_size = 256
+        self.music_voice_pure_model = load_model(MusicVoiceV5Model, music_voice_pure_model, self.device)
+        self.music_voice_no_pure_model = load_model(MusicVoiceV5Model, music_voice_no_pure_model, self.device)
+        self.gender_pure_model = load_model(MobileNetV2Gender, gender_pure_model, self.device)
+        self.gender_no_pure_model = load_model(MobileNetV2Gender, gender_no_pure_model, self.device)
+        logging.info("load model ok ! spend_time={}".format(time.time() - st))
+
+    def batch_predict(self, model, features):
+        st = time.time()
+        scores = []
+        with torch.no_grad():
+            for i in range(0, len(features), self.batch_size):
+                cur_data = features[i:i + self.batch_size].to(self.device)
+                predicts = model(cur_data)
+                predicts_score = F.softmax(predicts, dim=1)
+                scores.extend(predicts_score.cpu().numpy())
+        ret = np.array(scores)
+        global predict_time
+        predict_time += time.time() - st
+        return ret
+
+    def predict_pure(self, filename, features):
+        scores = self.batch_predict(self.music_voice_pure_model, features)
+        new_features = []
+        for idx, score in enumerate(scores):
+            if score[0] > 0.5:  # scored as non-vocal
+                continue
+            new_features.append(features[idx].numpy())
+
+        # too few vocal windows to classify
+        # these thresholds are tunable
+        new_feature_len = len(new_features)
+        new_feature_rate = len(new_features) / len(features)
+        if new_feature_len < 4 or new_feature_rate < 0.4:
+            logging.warning(
+                "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+            )
+            return GENDER_OTHER, -1
+        new_features = torch.from_numpy(np.array(new_features))
+        scores = self.batch_predict(self.gender_pure_model, new_features)
+        f_avg = sum(scores[:, 0]) / len(scores)
+        m_avg = sum(scores[:, 1]) / len(scores)
+        female_rate = f_avg / (f_avg + m_avg)
+        if female_rate > 0.65:
+            return GENDER_FEMALE, female_rate
+        if female_rate < 0.12:
+            return GENDER_MALE, female_rate
+        logging.warning(
+            "filename={}|predict_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+        )
+        return GENDER_OTHER, female_rate
+
+    def predict_no_pure(self, filename, features):
+        scores = self.batch_predict(self.music_voice_no_pure_model, features)
+        new_features = []
+        for idx, score in enumerate(scores):
+            if score[0] > 0.5:  # scored as non-vocal
+                continue
+            new_features.append(features[idx].numpy())
+
+        # too few vocal windows to classify
+        # these thresholds are tunable
+        new_feature_len = len(new_features)
+        new_feature_rate = len(new_features) / len(features)
+        if new_feature_len < 4 or new_feature_rate < 0.4:
+            logging.warning(
+                "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+            )
+            return GENDER_OTHER, -1
+        new_features = torch.from_numpy(np.array(new_features))
+        scores = self.batch_predict(self.gender_no_pure_model, new_features)
+        f_avg = sum(scores[:, 0]) / len(scores)
+        m_avg = sum(scores[:, 1]) / len(scores)
+        female_rate = f_avg / (f_avg + m_avg)
+        if female_rate > 0.75:
+            return GENDER_FEMALE, female_rate
+        if female_rate < 0.1:
+            return GENDER_MALE, female_rate
+        logging.warning(
+            "filename={}|predict_no_pure|other|len={}|rate={}".format(filename, new_feature_len, new_feature_rate)
+        )
+        return GENDER_OTHER, female_rate
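+    # predict_pure/predict_no_pure share the same two-stage structure: first drop
+    # windows the vocal detector scores as non-vocal, then average the gender
+    # softmax over what remains; only the female_rate cut-offs differ
+    # (0.65/0.12 on the pure path vs 0.75/0.1 on the no-pure path).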
+    def predict(self, filename, features):
+        st = time.time()
+        new_features = []
+        for i in range(FRAME_LEN, len(features), FRAME_LEN):
+            new_features.append(features[i - FRAME_LEN: i])
+        new_features = torch.from_numpy(np.array(new_features))
+        gender, rate = self.predict_pure(filename, new_features)
+        if gender == GENDER_OTHER:
+            logging.info("start no pure process...")
+            return self.predict_no_pure(filename, new_features)
+        print("predict|spend_time={}".format(time.time() - st))
+        return gender, rate
+
+    def process_one_logic(self, filename, file_path, cache_dir):
+        tmp_wav = os.path.join(cache_dir, "tmp.wav")
+        tmp_vb_wav = os.path.join(cache_dir, "tmp_vb.wav")
+        if not transcode(file_path, tmp_wav):
+            return ERR_CODE_TRANSCODE
+        if not volume_balanced(tmp_wav, tmp_vb_wav):
+            return ERR_CODE_VOLUME_BALANCED
+        features = get_one_mfcc(tmp_vb_wav)
+        if len(features) < FRAME_LEN:
+            logging.error("feature too short|file_path={}".format(file_path))
+            return ERR_CODE_FEATURE_TOO_SHORT
+        return self.predict(filename, features)
+
+    def process_one(self, file_path):
+        base_dir = os.path.dirname(file_path)
+        filename = os.path.splitext(file_path)[0]
+        cache_dir = os.path.join(base_dir, filename + "_cache")
+        if os.path.exists(cache_dir):
+            shutil.rmtree(cache_dir)
+        os.makedirs(cache_dir)
+        ret = self.process_one_logic(filename, file_path, cache_dir)
+        shutil.rmtree(cache_dir)
+        return ret
+
+    def process(self, file_path):
+        # NOTE: process_one() returns a bare int error code on failure and a
+        # (gender, female_rate) tuple on success, so this unpacking raises a
+        # TypeError on the failure path; callers should check the type first.
+        gender, female_rate = self.process_one(file_path)
+        logging.info("{}|gender={}|female_rate={}".format(file_path, gender, female_rate))
+        return gender, female_rate
+
+    def process_by_feature(self, feature_file):
+        """
+        Process a pre-computed feature file directly.
+        :param feature_file:
+        :return:
+        """
+        filename = os.path.splitext(feature_file)[0]
+        features = np.load(feature_file)
+        gender, female_rate = self.predict(filename, features)
+        return gender, female_rate
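The shape arithmetic behind `predict()` is easy to miss: at 16 kHz with hop 256, MFCC frames arrive at 62.5 per second, so each FRAME_LEN window covers 128 / 62.5 = 2.048 s of audio. A sketch of the same non-overlapping windowing (`window_features` is a hypothetical helper mirroring the loop in `VoiceClass.predict`; frames not covered by the loop are dropped):

```python
import numpy as np

SR = 16000       # sample rate used by get_one_mfcc
HOP = 256        # hop_length -> SR / HOP = 62.5 MFCC frames per second
FRAME_LEN = 128  # frames per model window -> 128 / 62.5 = 2.048 s of audio
MFCC_LEN = 80    # MFCC coefficients per frame


def window_features(features):
    """Split an (n_frames, MFCC_LEN) MFCC matrix into the non-overlapping
    (FRAME_LEN, MFCC_LEN) windows that VoiceClass.predict() feeds the models."""
    windows = [features[i - FRAME_LEN: i]
               for i in range(FRAME_LEN, len(features), FRAME_LEN)]
    return np.array(windows)
```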
print("process|spend_tm=={}".format(time.time() - st)) + + global transcode_time, vb_time, mfcc_time, predict_time + print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time, + vb_time, mfcc_time, predict_time)) + f_f = ret_map[0][0] + f_m = ret_map[0][1] + f_o = ret_map[0][2] + m_f = ret_map[1][0] + m_m = ret_map[1][1] + m_o = ret_map[1][2] + o_f = ret_map[2][0] + o_m = ret_map[2][1] + o_o = ret_map[2][2] + + print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o)) + print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o)) + print("om:{},of:{},oo:{}".format(o_m, o_f, o_o)) + # 女性准确率和召回率 + f_acc = f_f / (f_f + m_f + o_f) + f_recall = f_f / (f_f + f_m + f_o) + # 男性准确率和召回率 + m_acc = m_m / (m_m + f_m + o_m) + m_recall = m_m / (m_m + m_f + m_o) + print("female: acc={}|recall={}".format(f_acc, f_recall)) + print("male: acc={}|recall={}".format(m_acc, m_recall)) + + +def test_all(): + import glob + base_dir = "/data/datasets/music_voice_dataset_full/online_data_v3_top200" + female = glob.glob(os.path.join(base_dir, "female/*mp4")) + male = glob.glob(os.path.join(base_dir, "male/*mp4")) + other = glob.glob(os.path.join(base_dir, "other/*mp4")) + model_path = "/data/jianli.yang/voice_classification/online/models" + music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth") + music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth") + gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth") + gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth") + vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model) + + tot_st = time.time() + ret_map = { + 0: {0: 0, 1: 0, 2: 0}, + 1: {0: 0, 1: 0, 2: 0}, + 2: {0: 0, 1: 0, 2: 0} + } + for file in female: + st = time.time() + print("------------------------------>>>>>") + gender, female_score = vc.process(file) + ret_map[0][gender] += 1 + if gender != 0: + print("err:female->{}|{}|{}".format(gender, file, female_score)) + print("process|spend_tm=={}".format(time.time() - st)) + + for file in male: + st = time.time() + print("------------------------------>>>>>") + gender, female_score = vc.process(file) + ret_map[1][gender] += 1 + if gender != 1: + print("err:male->{}|{}|{}".format(gender, file, female_score)) + print("process|spend_tm=={}".format(time.time() - st)) + + for file in other: + st = time.time() + print("------------------------------>>>>>") + gender, female_score = vc.process(file) + ret_map[2][gender] += 1 + if gender != 2: + print("err:other->{}|{}|{}".format(gender, file, female_score)) + print("process|spend_tm=={}".format(time.time() - st)) + + global transcode_time, vb_time, mfcc_time, predict_time + print("spend_time:tot={}|transcode={}|vb={}|gen_feature={}|predict={}".format(time.time() - tot_st, transcode_time, + vb_time, mfcc_time, predict_time)) + f_f = ret_map[0][0] + f_m = ret_map[0][1] + f_o = ret_map[0][2] + m_f = ret_map[1][0] + m_m = ret_map[1][1] + m_o = ret_map[1][2] + o_f = ret_map[2][0] + o_m = ret_map[2][1] + o_o = ret_map[2][2] + + print("ff:{},fm:{},fo:{}".format(f_f, f_m, f_o)) + print("mm:{},mf:{},mo:{}".format(m_m, m_f, m_o)) + print("om:{},of:{},oo:{}".format(o_m, o_f, o_o)) + # 女性准确率和召回率 + f_acc = f_f / (f_f + m_f + o_f) + f_recall = f_f / (f_f + f_m + f_o) + # 男性准确率和召回率 + m_acc = m_m / (m_m + f_m + o_m) + m_recall = m_m / (m_m + m_f + m_o) + print("female: acc={}|recall={}".format(f_acc, f_recall)) + print("male: acc={}|recall={}".format(m_acc, m_recall)) + + +if 
+ __name__ == "__main__":
+    # test_all()
+    # test_all_feature()
+    model_path = sys.argv[1]
+    voice_path = sys.argv[2]
+    music_voice_pure_model = os.path.join(model_path, "voice_005_rec_v5.pth")
+    music_voice_no_pure_model = os.path.join(model_path, "voice_10_v5.pth")
+    gender_pure_model = os.path.join(model_path, "gender_8k_ratev5_v6_adam.pth")
+    gender_no_pure_model = os.path.join(model_path, "gender_8k_v6_adam.pth")
+    vc = VoiceClass(music_voice_pure_model, music_voice_no_pure_model, gender_pure_model, gender_no_pure_model)
+    for i in range(0, 1):
+        st = time.time()
+        print("------------------------------>>>>>")
+        vc.process(voice_path)
+        print("process|spend_tm=={}".format(time.time() - st))
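One caveat for integrators, and it applies equally to the `self.voice_class.process_one(...)` call added to `ToneShift.mix`: `VoiceClass.process_one` returns a bare int error code on failure but a `(gender, female_rate)` tuple on success, so unpacking the result directly raises a `TypeError` on the failure path. A defensive wrapper, as a sketch (`classify_gender` is a hypothetical helper, not part of the patch):

```python
from voice_class_online import VoiceClass, GENDER_OTHER


def classify_gender(vc, path):
    """Return (gender, female_rate); map the int error codes that
    VoiceClass.process_one() returns on failure to (GENDER_OTHER, -1)."""
    ret = vc.process_one(path)
    if isinstance(ret, tuple):
        return ret
    # ret is one of the negative ERR_CODE_* values (transcode, balancing, ...)
    return GENDER_OTHER, -1
```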