Page MenuHomePhabricator

No OneTemporary

diff --git a/AutoCoverTool/online/inference_one.py b/AutoCoverTool/online/inference_one.py
index 570d4a0..9893c1a 100644
--- a/AutoCoverTool/online/inference_one.py
+++ b/AutoCoverTool/online/inference_one.py
@@ -1,682 +1,683 @@
"""
单个处理的逻辑
song_id:
---src.mp3 // 源数据,需要提前放进去
---cache
---vocal.wav // 分离之后产生
---acc.wav // 分离之后产生
---vocal_32.wav // 分离之后产生
---song_id_sp1.wav // 合成之后产生
---song_id_sp2.wav // 合成之后产生
---song_id_sp2_d.wav // 降噪之后生成
---song_id_sp2_dv.wav // 降噪+拉伸之后产生 [占比太高的不产生]
---song_id_sp2_dve442.wav // 手动调整之后产生
---song_id_sp2_dve442_replace.wav // 替换之后产生
---song_id_sp2_dve442_replace_mix.wav // 人声+伴奏混合之后产生
---song_id
--acc.mp3 // 44k双声道320k
--vocal.mp3 // 44k双声道320k
--src.mp3 // 44k双声道320k
--song_id_sp2_dv.mp3 // 44k单声道320k
---song_id_out // 对外输出
--src.mp3 // 原始音频
--song_id_sp2_dv_replace_mix.mp3 // 制作完成的音频
环境安装:
conda create -n auto_song_cover python=3.9
# 安装demucs环境[进入到ref.music_remover 执行pip install -r requirements.txt]
# 安装so_vits_svc环境[进入到ref.so_vits_svc 执行pip install -r requirements.txt]
pip install librosa
pip install scikit-maad
pip install praat-parselmouth
pip install matplotlib
pip install torchvision
pip install madmom
pip install torchstat
环境设置:
export PATH=$PATH:/data/gpu_env_common/env/bin/ffmpeg/bin
export PYTHONPATH=$PWD:$PWD/ref/music_remover/demucs:$PWD/ref/so_vits_svc:$PWD/ref/split_dirty_frame
"""
import os
import time
import shutil
import random
import logging
import librosa
logging.basicConfig(filename='/tmp/inference.log', level=logging.INFO)
gs_err_code_success = 0
gs_err_code_no_src_mp3 = 1
gs_err_code_separate = 2
gs_err_code_trans_32 = 3
gs_err_code_encode_err = 4
gs_err_code_replace_err = 5
gs_err_code_replace_trans_err = 6
gs_err_code_mix_err = 7
gs_err_code_mix_transcode_err = 8
gs_err_code_no_src_dir = 9
gs_err_code_volume_err = 10
gs_err_code_trans2_442 = 11
gs_err_code_reverb = 12
gs_err_code_no_good_choice = 13
gs_err_code_preprocess_vocal = 14
gs_err_code_replace_except_err = 15
gs_denoise_exe = "/opt/soft/bin/denoise_exe"
gs_draw_volume_exe = "/opt/soft/bin/draw_volume"
gs_simple_mixer_path = "/opt/soft/bin/simple_mixer"
gs_rever_path = "/opt/soft/bin/dereverbrate"
from ref.music_remover.separate_interface import SeparateInterface
from ref.so_vits_svc.inference_main import *
from ref.split_dirty_frame.script.process_one import ReplaceVocalFrame, construct_power_fragment
class SongCoverInference:
def __init__(self):
self.work_dir = None
self.cache_dir = None
self.cid = None
self.src_mp3 = None
self.vocal_path = None
self.vocal_32_path = None
self.acc_path = None
self.speakers = [
10414574138721494,
10414574140317353,
1688849864840588,
3634463651,
5629499489839033,
5910973794723621,
6755399374234747,
8162774327817435,
8162774329368194,
1125899914308640, # 以下为男声,包括这个
12384898975368914,
12947848931397021,
3096224748076687,
3096224751151928,
5066549357604730,
5348024335101054,
6755399442719465,
7036874421386111
]
self.speakers2gender = {
- 10414574138721494: 1,
- 10414574140317353: 1,
- 1688849864840588: 1,
- 3634463651: 1,
- 5629499489839033: 1,
- 5910973794723621: 1,
- 6755399374234747: 1,
- 8162774327817435: 1,
- 8162774329368194: 1,
- 1125899914308640: 0, # 0是男
- 12384898975368914: 0,
- 12947848931397021: 0,
- 3096224748076687: 0,
- 3096224751151928: 0,
- 5066549357604730: 0,
- 5348024335101054: 0,
- 6755399442719465: 0,
- 7036874421386111: 0
+ 10414574138721494: 2,
+ 10414574140317353: 2,
+ 1688849864840588: 2,
+ 3634463651: 2,
+ 5629499489839033: 2,
+ 5910973794723621: 2,
+ 6755399374234747: 2,
+ 8162774327817435: 2,
+ 8162774329368194: 2,
+ 1125899914308640: 1, # 1是男
+ 12384898975368914: 1,
+ 12947848931397021: 1,
+ 3096224748076687: 1,
+ 3096224751151928: 1,
+ 5066549357604730: 1,
+ 5348024335101054: 1,
+ 6755399442719465: 1,
+ 7036874421386111: 1
}
self.speakers_model_path = "data/train_users/{}/logs/32k/G_2000.pth"
self.speakers_model_config = "data/train_users/{}/config/config.json"
st = time.time()
self.separate_inst = None
logging.info("post process ... ReplaceVocalFrame init sp={}".format(time.time() - st))
self.replace_vocal_frame_inst = None
logging.info("SongCoverInference init sp={}".format(time.time() - st))
def separate(self, cid, src_mp3, vocal_path, acc_path):
"""
人声伴奏分离
:param cid:
:param src_mp3:
:param vocal_path:
:param acc_path:
:return:
"""
st = time.time()
if self.separate_inst is None:
self.separate_inst = SeparateInterface()
if not self.separate_inst.process(cid, src_mp3, vocal_path, acc_path):
return gs_err_code_separate
if not os.path.exists(vocal_path) or not os.path.exists(acc_path):
return gs_err_code_separate
# 转码出一个32k单声道的数据
cmd = "ffmpeg -i {} -ar 32000 -ac 1 -y {} -loglevel fatal".format(vocal_path, self.vocal_32_path)
os.system(cmd)
if not os.path.exists(self.vocal_32_path):
return gs_err_code_trans_32
print("separate:cid={}|sp={}".format(cid, time.time() - st))
return gs_err_code_success
def get_start_ms(self, vocal_path):
"""
给定原始音频,找一段连续10s的音频
:param vocal_path:
:return:
"""
audio, sr = librosa.load(vocal_path, sr=16000)
audio = librosa.util.normalize(audio)
# 帧长100ms,帧移10ms,计算能量
power_arr = []
for i in range(0, len(audio) - 1600, 160):
power_arr.append(np.sum(np.abs(audio[i:i + 160])) / 160)
# 将能量小于等于10的部分做成段
power_arr = construct_power_fragment(power_arr)
fragments = []
last_pos = 0
for idx, line in enumerate(power_arr):
start = round(float(line[0]) * 0.01, 3)
duration = round(float(line[1]) * 0.01, 3)
fragments.append([last_pos, start - last_pos])
last_pos = start + duration
if last_pos < len(audio) / sr:
fragments.append([last_pos, len(audio) / sr - last_pos])
# 合并数据,两者间隔在50ms以内的合并起来
idx = 0
while idx < len(fragments) - 1:
if fragments[idx + 1][0] - (fragments[idx][0] + fragments[idx][1]) < 0.05:
fragments[idx][1] = fragments[idx + 1][0] + fragments[idx + 1][1] - fragments[idx][0]
del fragments[idx + 1]
idx -= 1
idx += 1
# out_file = vocal_path + "_power.csv"
# with open(out_file, "w") as f:
# f.write("Name\tStart\tDuration\tTime Format\tType\n")
# for fragment in fragments:
# start = round(float(fragment[0]), 3)
# duration = round(float(fragment[1]), 3)
# strr = "{}\t{}\t{}\t{}\n".format("11", start, duration, "decimal\tCue\t")
# f.write(strr)
# 筛选出开始的位置
# 1. 连续时长大于10s,当前段长度大于3s
# 2. 不可用
# 从0到fragments[idx], 包含idx其中人声段的总和
tot_vocal_duration = [fragments[0][1]]
for i in range(1, len(fragments)):
tot_vocal_duration.append(tot_vocal_duration[i - 1] + fragments[i][1])
# 计算出任意两段之间非人声占比
for i in range(0, len(fragments)):
if fragments[i][1] >= 3:
now_tot = 0
if i > 0:
now_tot = tot_vocal_duration[i - 1]
for j in range(i + 1, len(fragments)):
cur_rate = tot_vocal_duration[j] - now_tot
cur_rate = cur_rate / (fragments[j][1] + fragments[j][0] - fragments[i][0])
if cur_rate > 0.1:
return fragments[i][0]
return -1
def inference_speaker(self):
"""
推理生成合成后的音频
随机取5个干声,选择占比最小的,并且要求占比小于0.3
:return:
"""
st = time.time()
out_speakers = random.sample(self.speakers, 15)
out_songs_dict = {}
for speaker in out_speakers:
model_path = self.speakers_model_path.format(speaker)
config_path = self.speakers_model_config.format(speaker)
song_path = os.path.join(self.cache_dir, "{}_{}.wav".format(self.cid, speaker))
try:
inf(model_path, config_path, self.vocal_32_path, song_path, "prod")
except Exception as ex:
logging.info("cid={}, inference_speaker err={}".format(self.cid, ex))
continue
if os.path.exists(song_path):
if self.replace_vocal_frame_inst is None:
self.replace_vocal_frame_inst = ReplaceVocalFrame(
"data/models/split_dirty_frame_v5_3_epoch3_852.pth")
rate = self.replace_vocal_frame_inst.get_rate(song_path)
if rate < 0.3:
out_songs_dict[song_path] = rate
# 从内部选择占比最低的
out_songs = []
if len(out_songs_dict.keys()) > 0:
st_sec = self.get_start_ms(self.vocal_path)
song_msg = sorted(out_songs_dict.items(), key=lambda kv: kv[1])[0]
out_songs = [song_msg[0]]
logging.info("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2),
round(st_sec, 3)))
print("GetRate:cid={},song={},rate={},st_tm={}".format(self.cid, song_msg[0], round(song_msg[1], 2),
round(st_sec, 3)))
# logging.info("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st))
print("inference_speaker len = {} finish sp = {}".format(len(out_songs), time.time() - st))
return out_songs
def get_new_vocal_rate(self, songs):
"""
获取人声的比率
:param songs:
:return:
"""
st = time.time()
need_to_process_song = []
for song in songs:
if self.replace_vocal_frame_inst is None:
self.replace_vocal_frame_inst = ReplaceVocalFrame("data/models/split_dirty_frame_v5_3_epoch3_852.pth")
rate = self.replace_vocal_frame_inst.get_rate(song)
logging.info("{} {} replace_rate={}".format(self.cid, song, rate))
if rate < 1.0:
need_to_process_song.append(song)
logging.info(
"get_new_vocal_rate belen = {} len = {} finish sp = {}".format(len(songs), len(need_to_process_song),
time.time() - st))
return need_to_process_song
def preprocess_vocal(self, songs, vocal_path):
"""
1. 降噪
2. 拉伸
:param songs:
:param vocal_path: 参考的音频信号
:return:
"""
st = time.time()
dv_out_list = []
for song in songs:
denoise_path = str(song).replace(".wav", "_d.wav")
cmd = "{} {} {}".format(gs_denoise_exe, song, denoise_path)
os.system(cmd)
if not os.path.exists(denoise_path):
print("{} {} ERROR denoise".format(self.cid, song))
continue
# 拉伸
volume_path = str(song).replace(".wav", "_dv.wav")
cmd = "{} {} {} {}".format(gs_draw_volume_exe, denoise_path, vocal_path, volume_path)
os.system(cmd)
if not os.path.exists(volume_path):
print("{} {} ERROR denoise".format(self.cid, volume_path))
continue
dv_out_list.append(volume_path)
print(
"preprocess_vocal belen = {} len = {} finish sp = {}".format(len(songs), len(dv_out_list),
time.time() - st))
return dv_out_list
def output(self, dv_out_list):
"""
对外输出数据
:param dv_out_list:
:return:
"""
st = time.time()
out_dir = os.path.join(self.work_dir, self.cid)
if os.path.exists(out_dir):
shutil.rmtree(out_dir)
os.makedirs(out_dir)
# 拷贝数据
dst_mp3_path = os.path.join(out_dir, "src_mp3")
dst_acc_path = os.path.join(out_dir, "acc.mp3")
dst_vocal_path = os.path.join(out_dir, "vocal.mp3")
shutil.copyfile(self.src_mp3, dst_mp3_path)
cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.acc_path, dst_acc_path)
os.system(cmd)
if not os.path.exists(dst_acc_path):
return gs_err_code_encode_err
cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(self.vocal_path, dst_vocal_path)
os.system(cmd)
if not os.path.exists(dst_vocal_path):
return gs_err_code_encode_err
# 将所有数据放到out_dir中,用于给人工标注
for dv_wav in dv_out_list:
dv_wav_name = str(dv_wav).split("/")[-1].replace(".wav", "_441.mp3")
dst_dv_path = os.path.join(out_dir, dv_wav_name)
cmd = "ffmpeg -i {} -ar 44100 -ac 1 -ab 320k -y {} -loglevel fatal".format(dv_wav, dst_dv_path)
os.system(cmd)
if not os.path.exists(dst_dv_path):
print("{} encode err!".format(cmd))
continue
logging.info(
"preprocess_vocal output sp = {}".format(time.time() - st))
def process_one(self, cid, work_dir, enable_output=False):
logging.info("\nstart:cid={},work_dir={}----------------------->>>>>>>>".format(cid, work_dir))
self.cid = cid
self.work_dir = work_dir
# 所有不对外交付的,全部放到这里
self.cache_dir = os.path.join(work_dir, "cache")
if os.path.exists(self.cache_dir):
shutil.rmtree(self.cache_dir)
os.makedirs(self.cache_dir)
self.src_mp3 = os.path.join(self.work_dir, "src.mp3")
if not os.path.exists(self.src_mp3):
return gs_err_code_no_src_mp3
self.vocal_path = os.path.join(self.cache_dir, "vocal.wav")
self.vocal_32_path = os.path.join(self.cache_dir, "vocal_32.wav")
self.acc_path = os.path.join(self.cache_dir, "acc.wav")
if not os.path.exists(self.vocal_32_path):
logging.info("start separate ... {} {} {}".format(self.src_mp3, self.vocal_path, self.acc_path))
err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path)
if err != gs_err_code_success:
return err, None, None
logging.info("start inference_speaker ...")
out_songs = self.inference_speaker()
dv_out_list = self.preprocess_vocal(out_songs, self.vocal_path)
if len(dv_out_list) == 0:
return gs_err_code_no_good_choice, None, None
mix_mp3_path = None
gender = -1
if enable_output:
self.output(dv_out_list)
else:
# 默认全部处理一遍
for dv_out_path in dv_out_list:
src_path = dv_out_path.replace("_dv.wav", ".wav")
err, mix_mp3_path = self.after_process(self.cid, self.work_dir, src_path, dv_out_path, self.vocal_path,
self.acc_path,
True, False)
if err != gs_err_code_success:
logging.info("after_process err {}".format(err))
# 取出性别属性
if err == gs_err_code_success and mix_mp3_path is not None:
gender = self.speakers2gender[int(str(os.path.basename(mix_mp3_path)).split("_")[1])]
logging.info("finish:cid={},work_dir={}----------------------->>>>>>>>".format(cid, work_dir))
return gs_err_code_success, mix_mp3_path, gender
def reverb_by_vocal(self, file):
st = time.time()
file_442 = file.replace(".wav", "_442.wav")
if not os.path.exists(file_442):
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {}".format(file, file_442)
os.system(cmd)
if not os.path.exists(file_442):
return gs_err_code_trans2_442, None
file_dst = file.replace(".wav", "_442_dr.wav")
cmd = "{} {} {} {}".format(gs_rever_path, self.vocal_path, file_442, file_dst)
os.system(cmd)
if not os.path.exists(file_dst):
return gs_err_code_reverb, None
print("cid = {}, reverb_by_vocal sp={}".format(self.cid, time.time() - st))
return gs_err_code_success, file_dst
def after_process(self, cid, work_dir, in_file, effect_file, vocal_file, acc_file, need_draw=True,
need_reverb=True):
"""
后处理逻辑
将处理好的音频进行替换,然后和伴奏进行混合,最后进行编码
:return:
"""
if need_reverb:
# 抓取混响
err, effect_file = self.reverb_by_vocal(in_file)
if err != gs_err_code_success:
return err, None
if need_draw:
# 增加一个拉伸的步骤
volume_path = str(effect_file).replace(".wav", "_dv.wav")
cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, vocal_file, volume_path)
print(cmd)
os.system(cmd)
if not os.path.exists(volume_path):
print("{} {} ERROR draw volume".format(self.cid, volume_path))
return gs_err_code_volume_err, None
effect_file = volume_path
st = time.time()
self.cid = cid
self.work_dir = work_dir
self.src_mp3 = os.path.join(self.work_dir, "src.mp3")
if not os.path.exists(self.work_dir):
return gs_err_code_no_src_dir
self.replace_vocal_frame_inst.process(in_file, effect_file, vocal_file)
dst_path = effect_file + "_replace.wav"
if not os.path.exists(dst_path):
return gs_err_code_replace_err, None
print("replace_vocal_frame_inst sp = {}".format(time.time() - st))
# 转码
dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav")
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442)
os.system(cmd)
if not os.path.exists(dst_path_442):
return gs_err_code_replace_trans_err, None
# 合并转码后再做一次拉伸,保证响度
volume_path = str(dst_path_442).replace(".wav", "_dv.wav")
cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, vocal_file, volume_path)
print(cmd)
os.system(cmd)
if not os.path.exists(volume_path):
print("{} {} ERROR draw volume".format(self.cid, volume_path))
return gs_err_code_volume_err, None
dst_path_442 = volume_path
# 混合
mix_path = dst_path_442.replace("_replace442.wav", "_replace442_mix.wav")
cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, acc_file, mix_path)
print("{}".format(cmd))
os.system(cmd)
if not os.path.exists(mix_path):
return gs_err_code_mix_err, None
# 编码为mp3
output_dir = os.path.join(self.work_dir, self.cid + "_out")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
name = str(mix_path).replace("_replace442_mix.wav", "_replace442_mix.mp3").split("/")[-1]
mix_path_mp3 = os.path.join(output_dir, name)
cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3)
os.system(cmd)
if not os.path.exists(mix_path_mp3):
return gs_err_code_mix_transcode_err, None
# 拷贝src到output_dir
# shutil.copyfile(self.src_mp3, os.path.join(output_dir, "src.mp3"))
# logging.info("after_process sp = {}".format(time.time() - st))
return gs_err_code_success, mix_path_mp3
####################################新对外接口############################################################
def prepare_env(self, cid, work_dir, create_dir=False):
self.cid = cid
self.work_dir = work_dir
# 所有不对外交付的,全部放到这里
self.cache_dir = os.path.join(work_dir, "cache")
if create_dir:
if os.path.exists(self.cache_dir):
shutil.rmtree(self.cache_dir)
os.makedirs(self.cache_dir)
self.src_mp3 = os.path.join(self.work_dir, "src.mp3")
if not os.path.exists(self.src_mp3):
return gs_err_code_no_src_mp3
self.vocal_path = os.path.join(self.cache_dir, "vocal.wav")
self.vocal_32_path = os.path.join(self.cache_dir, "vocal_32.wav")
self.acc_path = os.path.join(self.cache_dir, "acc.wav")
return gs_err_code_success
def generate_svc_file(self, cid, work_dir):
"""
:param cid:
:param work_dir:
:return:err_code, 生成出的svc的文件名称
"""
err = self.prepare_env(cid, work_dir, create_dir=True)
if err != gs_err_code_success:
return err, None
# 音源分离
if not os.path.exists(self.vocal_32_path):
st = time.time()
err = self.separate(cid, self.src_mp3, self.vocal_path, self.acc_path)
logging.info("cid={},separate,sp={}".format(self.cid, time.time() - st))
if err != gs_err_code_success:
return err, None
# 生成svc,只保留一个最佳的
st = time.time()
out_songs = self.inference_speaker()
if len(out_songs) == 0:
return gs_err_code_no_good_choice, None
logging.info("cid={},inference_speaker,{},sp={}".format(self.cid, out_songs[0], time.time() - st))
# 预处理人声
dv_out_list = self.preprocess_vocal(out_songs, self.vocal_path)
if len(dv_out_list) == 0:
return gs_err_code_preprocess_vocal, None
return gs_err_code_success, dv_out_list[0]
def effect(self, cid, work_dir, svc_file):
st = time.time()
err = self.prepare_env(cid, work_dir)
if err != gs_err_code_success:
return err, None
logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st))
# 做音效
st = time.time()
err, effect_file = self.reverb_by_vocal(svc_file)
if err != gs_err_code_success:
return err, None
logging.info("cid={},reverb_by_vocal,{},sp={}".format(self.cid, svc_file, time.time() - st))
return err, effect_file
def mix(self, cid, work_dir, svc_file, effect_file):
"""
做音效以及合并
:param cid:
:param work_dir:
:param svc_file:
:param effect_file:
:return: err_code, 完成的mp3文件
"""
st = time.time()
err = self.prepare_env(cid, work_dir)
if err != gs_err_code_success:
return err, None
logging.info("cid={},effect_and_mix,{},sp={}".format(self.cid, svc_file, time.time() - st))
# 拉伸
st = time.time()
volume_path = str(effect_file).replace(".wav", "_dv.wav")
cmd = "{} {} {} {}".format(gs_draw_volume_exe, effect_file, self.vocal_path, volume_path)
os.system(cmd)
if not os.path.exists(volume_path):
print("{} {} ERROR draw volume".format(self.cid, volume_path))
return gs_err_code_volume_err, None
effect_file = volume_path
logging.info("cid={},draw_volume,{},sp={}".format(self.cid, svc_file, time.time() - st))
# 替换
st = time.time()
try:
self.replace_vocal_frame_inst.process(svc_file, effect_file, self.vocal_path)
except Exception as ex:
logging.info("{},replace_vocal_frame_inst, {}", self.cid, ex)
return gs_err_code_replace_except_err, None
dst_path = effect_file + "_replace.wav"
if not os.path.exists(dst_path):
return gs_err_code_replace_err, None
logging.info("cid={},replace_vocal_frame_inst,{},sp={}".format(self.cid, svc_file, time.time() - st))
# 转码
st = time.time()
dst_path_442 = dst_path.replace("_replace.wav", "_replace442.wav")
cmd = "ffmpeg -i {} -ar 44100 -ac 2 -y {} -loglevel fatal".format(dst_path, dst_path_442)
os.system(cmd)
if not os.path.exists(dst_path_442):
return gs_err_code_replace_trans_err, None
logging.info("cid={},transcode,{},sp={}".format(self.cid, svc_file, time.time() - st))
# 合并转码后再做一次拉伸,保证响度
st = time.time()
volume_path = str(dst_path_442).replace("_replace442.wav", "_replace442_dv.wav")
cmd = "{} {} {} {}".format(gs_draw_volume_exe, dst_path_442, self.vocal_path, volume_path)
os.system(cmd)
if not os.path.exists(volume_path):
print("{} {} ERROR draw volume".format(self.cid, volume_path))
return gs_err_code_volume_err, None
dst_path_442 = volume_path
logging.info("cid={},draw_volume2,{},sp={}".format(self.cid, svc_file, time.time() - st))
# 混合
st = time.time()
mix_path = dst_path_442.replace("_replace442_dv.wav", "_replace442_dv_mix.wav")
cmd = "{} {} {} {}".format(gs_simple_mixer_path, dst_path_442, self.acc_path, mix_path)
os.system(cmd)
if not os.path.exists(mix_path):
return gs_err_code_mix_err, None
logging.info("cid={},mixer,{},sp={}".format(self.cid, svc_file, time.time() - st))
# 编码为mp3
st = time.time()
output_dir = os.path.join(self.work_dir, self.cid + "_out")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
name = str(mix_path).replace("_replace442_dv_mix.wav", "_replace442_dv_mix.mp3").split("/")[-1]
mix_path_mp3 = os.path.join(output_dir, name)
cmd = "ffmpeg -i {} -ab 320k -y {} -loglevel fatal".format(mix_path, mix_path_mp3)
print(cmd)
os.system(cmd)
if not os.path.exists(mix_path_mp3):
return gs_err_code_mix_transcode_err, None
logging.info("cid={},encode,{},sp={}".format(self.cid, svc_file, time.time() - st))
return gs_err_code_success, mix_path_mp3
def get_gender(self, svc_file):
return self.speakers2gender[int(os.path.basename(svc_file).split("_")[1])]
def process_one_logic(self, cid, work_dir):
"""
搞成两部分:
1. 分离数据+5次推理,获取最佳结果,并保存
2. 利用最佳结果做音效以及合并
:return:
"""
err, svc_file = self.generate_svc_file(cid, work_dir)
gender = -1
if err != gs_err_code_success:
return err, svc_file, gender,
gender = self.get_gender(svc_file)
err, effect_file = self.effect(cid, work_dir, svc_file)
if err != gs_err_code_success:
return err, svc_file, gender
err, mix_mp3_path = self.mix(cid, work_dir, svc_file, effect_file)
return err, mix_mp3_path, gender
def test():
arr = [
# "611752105020343687",
# "611752105023532439",
- "611752105030419688",
+ # "611752105030419688",
+ "611752105030485748",
]
base_dir = "/data/rsync/jianli.yang/AutoCoverTool/data/test"
s_inst = SongCoverInference()
for cid in arr:
st = time.time()
# err, mix_mp3, gender = s_inst.process_one(cid, os.path.join(base_dir, cid), False)
err, mix_mp3, gender = s_inst.process_one_logic(cid, os.path.join(base_dir, cid))
print(mix_mp3, gender)
print("cid={} RealFinish err={} sp={}".format(cid, err, time.time() - st))
if __name__ == '__main__':
test()
diff --git a/AutoCoverTool/script/shuffle_music.py b/AutoCoverTool/script/shuffle_music.py
index 1b83bc9..0473944 100644
--- a/AutoCoverTool/script/shuffle_music.py
+++ b/AutoCoverTool/script/shuffle_music.py
@@ -1,25 +1,80 @@
"""
载入人声,将人声的频谱进行向上平移
"""
import librosa
import soundfile
+import numpy as np
from copy import deepcopy
+def local_maxium(x):
+ """
+ 求序列的极大值
+ :param x:
+ :return:
+ """
+ d = np.diff(x)
+ l_d = len(d)
+ maxium = []
+ loc = []
+ for i in range(l_d - 1):
+ if d[i] > 0 and d[i + 1] <= 0:
+ maxium.append(x[i + 1])
+ loc.append(i + 1)
+ return maxium, loc
+
+
+def Formant_Cepst(u, cepstL):
+ """
+ 来源: https://github.com/taw19960426/-Speech-signal-processing-experiment-tutorial-_python/blob/master/%E5%85%B1%E6%8C%AF%E5%B3%B0%E4%BC%B0%E8%AE%A1%E5%87%BD%E6%95%B0.py
+ 倒谱法共振峰估计函数
+ :param u:输入信号
+ :param cepstL:🔪频率上窗函数的宽度
+ :return: val共振峰幅值
+ :return: loc共振峰位置
+ :return: spec包络线
+ """
+ wlen2 = len(u) // 2
+ u_fft = np.fft.fft(u) # 按式(2-1)计算
+ U = np.log(np.abs(u_fft[:wlen2]))
+ Cepst = np.fft.ifft(U) # 按式(2-2)计算
+ cepst = np.zeros(wlen2, dtype=np.complex)
+ cepst[:cepstL] = Cepst[:cepstL] # 按式(2-3)计算
+ cepst[-cepstL + 1:] = Cepst[-cepstL + 1:] # 取第二个式子的相反
+ spec = np.real(np.fft.fft(cepst))
+ val, loc = local_maxium(spec) # 在包络线上寻找极大值
+ return val, loc, spec
+
+
def test(in_vocal):
sr = 44100
audio, sr = librosa.load(in_vocal, sr=sr, mono=True)
stft = librosa.stft(audio)
stft = stft.transpose()
new_stft = deepcopy(stft)
for ii in range(0, len(stft)):
- for jj in range(0, len(stft[0])):
- pass
+ # 寻找峰值点
+ max_pos = []
+ for i in range(1, len(stft[ii]) - 1, ):
+ if stft[ii][i + 1] > stft[ii][i] > stft[ii][i - 1]:
+ max_pos.append(i)
+ # 只调整1kHz以上的峰值点, 从48的位置开始调整
+ for i in range(len(max_pos) - 1):
+ if max_pos[i] < 48:
+ continue
+ # 平移到两峰之间
+ cur_i = max_pos[i]
+ next_i = max_pos[i + 1]
+ print(cur_i, next_i)
+ # 将cur_i移动到cur_i和next_i中间
+ dst_i = (cur_i + next_i) // 2
+ new_stft[ii][dst_i] = stft[ii][cur_i]
print(stft.shape)
- istft = librosa.istft(stft)
+ new_stft = new_stft.transpose()
+ istft = librosa.istft(new_stft)
soundfile.write(str(in_vocal).replace(".wav", "_out.wav"), istft, 44100, format="wav")
if __name__ == '__main__':
- test("/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal.wav")
+ test("/Users/yangjianli/starmaker-work/research/tmp_code/消音相关/test_out/ins_main_out/test2/tot/3/vocal_out_01.wav")

File Metadata

Mime Type
text/x-diff
Expires
Sun, Jan 12, 08:29 (1 d, 15 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1347149
Default Alt Text
(32 KB)

Event Timeline