Voice separator (#468)

* changes

* changes

* changes

* changes

* changes

* changes
This commit is contained in:
Harisreedhar
2024-04-02 13:29:37 +05:30
committed by GitHub
parent bd8034cbc7
commit f73c00acf5
5 changed files with 165 additions and 6 deletions
+23
View File
@@ -6,6 +6,7 @@ import scipy
from facefusion.filesystem import is_audio
from facefusion.ffmpeg import read_audio_buffer
from facefusion.typing import Fps, Audio, Spectrogram, AudioFrame
from facefusion.audio_extractor import batch_extract_voice
def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
@@ -16,6 +17,14 @@ def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Opti
return None
def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
    """
    Return the voice-only audio frame at position frame_number.

    The frames come from read_static_voice(audio_path, fps). None is returned
    when audio_path is rejected by is_audio or when frame_number does not
    address one of the available frames.
    """
    if not is_audio(audio_path):
        return None
    voice_frames = read_static_voice(audio_path, fps)
    frame_is_available = frame_number in range(len(voice_frames))
    return voice_frames[frame_number] if frame_is_available else None
def create_empty_audio_frame() -> AudioFrame:
    """Return a placeholder audio frame: an 80 x 16 array of int16 zeros."""
    return numpy.zeros((80, 16), dtype = numpy.int16)
@@ -34,6 +43,20 @@ def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
return None
@lru_cache(maxsize = None)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """
    Read an audio file, keep only the extracted voice and cut it into frames.

    The result is cached per (audio_path, fps) by lru_cache. Returns None when
    audio_path is rejected by is_audio.
    """
    if not is_audio(audio_path):
        return None

    # presumably the sample rate and channel count requested from the decoder - confirm against read_audio_buffer
    sample_rate = 16000
    channel_total = 2

    audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
    # interpret the raw buffer as int16 values arranged in rows of two
    voice = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, channel_total)
    voice = batch_extract_voice(voice, 1000000, 0.75)
    voice = normalize_audio(voice)
    voice = filter_audio(voice, -0.97)
    spectrogram = create_spectrogram(voice, sample_rate, 80, 800, 55.0, 7600.0)
    return extract_audio_frames(spectrogram, 80, 16, fps)
def normalize_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
if audio.ndim > 1:
audio = numpy.mean(audio, axis = 1)
+132
View File
@@ -0,0 +1,132 @@
from typing import Any, Tuple
from functools import lru_cache
from time import sleep
import threading
import scipy
import numpy
import onnxruntime
import facefusion.globals
from facefusion import process_manager
from facefusion.typing import ModelSet, AudioChunk, Audio
from facefusion.execution import apply_execution_provider_options
from facefusion.filesystem import resolve_relative_path
from facefusion.download import conditional_download
# Module-wide inference session, created on demand by get_voice_extractor()
# and reset to None by clear_voice_extractor().
VOICE_EXTRACTOR = None
# Held by get_voice_extractor() while it waits for and creates the session.
THREAD_LOCK : threading.Lock = threading.Lock()
# Download url and local path of the model file used by this module.
MODELS : ModelSet =\
{
'voice_extractor':
{
'url': 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Vocal_2.onnx',
'path': resolve_relative_path('../.assets/models/Kim_Vocal_2.onnx')
}
}
def get_voice_extractor() -> Any:
    """
    Return the shared voice extractor session, creating it on first use.

    All work happens while holding THREAD_LOCK: the call first polls
    process_manager.is_checking() every half second until it is false, then
    builds the onnxruntime session from the configured model path if none
    exists yet.
    """
    global VOICE_EXTRACTOR

    with THREAD_LOCK:
        while process_manager.is_checking():
            sleep(0.5)
        if VOICE_EXTRACTOR is None:
            model_path = MODELS.get('voice_extractor').get('path')
            execution_providers = apply_execution_provider_options(facefusion.globals.execution_providers)
            VOICE_EXTRACTOR = onnxruntime.InferenceSession(model_path, providers = execution_providers)
    return VOICE_EXTRACTOR
def clear_voice_extractor() -> None:
    """Drop the module-wide voice extractor session by resetting it to None."""
    global VOICE_EXTRACTOR

    VOICE_EXTRACTOR = None
def pre_check() -> bool:
    """
    Download the voice extractor model unless downloads are skipped.

    When facefusion.globals.skip_download is false, the model url from MODELS
    is handed to conditional_download together with the models directory. The
    download is bracketed by process_manager.check() and process_manager.end().

    Returns True in every case; an exception raised by the download propagates.
    """
    if not facefusion.globals.skip_download:
        download_directory_path = resolve_relative_path('../.assets/models')
        model_urls =\
        [
            MODELS.get('voice_extractor').get('url')
        ]
        process_manager.check()
        try:
            conditional_download(download_directory_path, model_urls)
        finally:
            # always pair check() with end(): get_voice_extractor() polls
            # process_manager.is_checking() while holding its lock, so a download
            # that raises must not leave the checking state set (presumably end()
            # is what clears it - confirm against process_manager)
            process_manager.end()
    return True
@lru_cache(maxsize = None)
def create_static_hanning_window(filter_size : int) -> Any:
    """
    Return a Hann window of filter_size points created with sym = False.

    The window is cached per filter_size by lru_cache, so repeated calls hand
    back the same array object.
    """
    return scipy.signal.windows.hann(filter_size, sym = False)
def batch_extract_voice(audio : Audio, chunk_size : int, overlap_size : float) -> Audio:
    """
    Extract the voice from audio in overlapping chunks and average the overlaps.

    audio is sliced along its first axis into windows of at most chunk_size
    rows, and consecutive windows start int(chunk_size * (1 - overlap_size))
    rows apart. Each window is passed to extract_voice; where windows overlap
    the results are averaged.

    Returns a float32 array with audio.shape[0] rows and two columns.

    Raises ValueError when chunk_size and overlap_size yield a step smaller
    than one row (for instance overlap_size >= 1).
    """
    step_size = int(chunk_size * (1 - overlap_size))
    # a zero step makes range() fail with an unrelated message and a negative
    # step skips the loop entirely, leaving 0 / 0 = NaN for every sample
    if step_size < 1:
        raise ValueError('chunk_size and overlap_size must result in a step of at least one sample')
    sample_total = audio.shape[0]
    audio_total = numpy.zeros((sample_total, 2), dtype = numpy.float32)
    audio_count = numpy.zeros((sample_total, 2), dtype = numpy.float32)

    for start in range(0, sample_total, step_size):
        end = min(start + chunk_size, sample_total)
        audio_total[start:end, ...] += extract_voice(audio[start:end, ...])
        audio_count[start:end, ...] += 1
    # every row is covered by at least one window, so the count is never zero
    return audio_total / audio_count
def extract_voice(audio_chunk : AudioChunk) -> AudioChunk:
    """
    Run a single audio chunk through the voice extractor model.

    The chunk is split into fixed-size windows (prepare_audio_chunk), turned
    into the spectral layout of the model input (decompose_audio_chunk), fed
    to the model, converted back to a waveform (compose_audio_chunk) and
    finally trimmed back to the original length (normalize_audio_chunk).
    """
    voice_extractor = get_voice_extractor()
    extractor_input = voice_extractor.get_inputs()[0]
    # input shape without its leading dimension
    extractor_shape = extractor_input.shape[1:]

    hop_length = 1024
    filter_size = 7680
    trim_size = filter_size // 2
    frequency_bins = trim_size + 1
    chunk_size = hop_length * (extractor_shape[2] - 1)

    windows, pad_size = prepare_audio_chunk(audio_chunk, chunk_size, trim_size)
    spectrum = decompose_audio_chunk(windows, filter_size, hop_length, frequency_bins, extractor_shape)
    extractor_outputs = voice_extractor.run(None,
    {
        extractor_input.name: spectrum
    })
    voice = compose_audio_chunk(extractor_outputs[0], filter_size, hop_length, frequency_bins, extractor_shape)
    return normalize_audio_chunk(voice, chunk_size, trim_size, pad_size)
def prepare_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int) -> Tuple[AudioChunk, int]:
    """
    Scale, pad and cut an audio chunk into windows of chunk_size samples.

    The chunk is transposed, converted to float32 and divided by the int16
    maximum. It is zero padded by trim_size in front and by trim_size plus
    pad_size at the back, then cut into windows of chunk_size columns that
    start chunk_size - 2 * trim_size columns apart. The windows are stacked
    along the first axis.

    Returns the stacked windows and pad_size. pad_size is always between 1 and
    the step size, never 0.
    """
    step_size = chunk_size - 2 * trim_size
    channels = audio_chunk.T
    sample_total = channels.shape[1]
    pad_size = step_size - sample_total % step_size
    padded_total = sample_total + pad_size

    channels = channels.astype(numpy.float32) / numpy.iinfo(numpy.int16).max
    channels = numpy.pad(channels, ((0, 0), (trim_size, trim_size + pad_size)), mode = 'constant', constant_values = 0)

    windows = []
    for offset in range(0, padded_total, step_size):
        windows.append(channels[:, offset:offset + chunk_size])
    stacked = numpy.concatenate(windows, axis = 0).reshape((-1, chunk_size))
    return stacked, pad_size
def decompose_audio_chunk(audio_chunk : AudioChunk, filter_size : int, hop_length : int, frequency_bins : int, extractor_shape : Tuple[int, int, int]) -> AudioChunk:
    """
    Convert waveform windows into the spectral layout fed to the extractor.

    A short-time Fourier transform is taken with the cached Hann window, the
    real and imaginary parts are split into their own axis, the result is
    regrouped to extractor_shape[0] planes per item and cut down to the first
    extractor_shape[1] frequency bins. Finally the values are multiplied by
    the window sum (written as a division by its reciprocal).
    """
    window = create_static_hanning_window(filter_size)
    overlap_size = filter_size - hop_length
    _, _, spectrum = scipy.signal.stft(audio_chunk, nperseg = filter_size, noverlap = overlap_size, window = window, padded = False)
    # real / imaginary parts become a trailing axis that is then moved to position 1
    spectrum = numpy.stack((numpy.real(spectrum), numpy.imag(spectrum)), axis = -1)
    spectrum = spectrum.transpose((0, 3, 1, 2))
    spectrum = spectrum.reshape((-1, 2, 2, frequency_bins, extractor_shape[2]))
    spectrum = spectrum.reshape((-1, extractor_shape[0], frequency_bins, extractor_shape[2]))
    spectrum = spectrum[:, :, :extractor_shape[1]]
    # kept in place so the array dtype is not promoted by the scalar
    spectrum /= numpy.sqrt(1.0 / window.sum() ** 2)
    return spectrum
def compose_audio_chunk(audio_chunk : AudioChunk, filter_size : int, hop_length : int, frequency_bins : int, extractor_shape : Tuple[int, int, int]) -> AudioChunk:
    """
    Convert extractor output back into waveform windows.

    The frequency axis is zero padded from extractor_shape[1] up to
    frequency_bins, the real and imaginary planes are recombined into complex
    values and an inverse short-time Fourier transform is taken with the
    cached Hann window. The waveform is then divided by the window sum
    (written as a multiplication by its reciprocal).
    """
    window = create_static_hanning_window(filter_size)
    bin_padding = frequency_bins - extractor_shape[1]
    planes = numpy.pad(audio_chunk, ((0, 0), (0, 0), (0, bin_padding), (0, 0)), mode = 'constant')
    planes = planes.reshape(-1, 2, frequency_bins, extractor_shape[2])
    planes = planes.transpose((0, 2, 3, 1))
    spectrum = planes[:, :, :, 0] + 1j * planes[:, :, :, 1]
    overlap_size = filter_size - hop_length
    _, waveform = scipy.signal.istft(spectrum, nperseg = filter_size, noverlap = overlap_size, window = window)
    # kept in place so the array dtype is not promoted by the scalar
    waveform *= numpy.sqrt(1.0 / window.sum() ** 2)
    return waveform
def normalize_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int, pad_size : int) -> AudioChunk:
    """
    Undo the windowing and padding applied by prepare_audio_chunk.

    The windows are regrouped into pairs of chunk_size samples, trim_size
    samples are removed from both ends of every window, the windows are joined
    into two continuous rows and the last pad_size samples are dropped.

    Returns an array with two columns (the transpose of the two joined rows).

    The end positions are computed explicitly instead of using negative
    indices, so a trim_size or pad_size of 0 keeps all samples rather than
    producing an empty array.
    """
    audio_chunk = audio_chunk.reshape((-1, 2, chunk_size))
    # same as trim_size:-trim_size for trim_size > 0, but also valid for 0
    audio_chunk = audio_chunk[:, :, trim_size:chunk_size - trim_size].transpose(1, 0, 2)
    audio_chunk = audio_chunk.reshape(2, -1)
    # same as :-pad_size for pad_size > 0, but also valid for 0
    sample_total = max(audio_chunk.shape[1] - pad_size, 0)
    return audio_chunk[:, :sample_total].T
+4 -3
View File
@@ -15,7 +15,7 @@ import facefusion.choices
import facefusion.globals
from facefusion.face_analyser import get_one_face, get_average_face
from facefusion.face_store import get_reference_faces, append_reference_face
from facefusion import face_analyser, face_masker, content_analyser, config, process_manager, metadata, logger, wording
from facefusion import face_analyser, face_masker, content_analyser, config, process_manager, metadata, logger, wording, audio_extractor
from facefusion.content_analyser import analyse_image, analyse_video
from facefusion.processors.frame.core import get_frame_processors_modules, load_frame_processor_module
from facefusion.common_helper import create_metavar, get_first
@@ -193,7 +193,7 @@ def run(program : ArgumentParser) -> None:
if facefusion.globals.force_download:
force_download()
return
if not pre_check() or not content_analyser.pre_check() or not face_analyser.pre_check() or not face_masker.pre_check():
if not pre_check() or not content_analyser.pre_check() or not face_analyser.pre_check() or not face_masker.pre_check() or not audio_extractor.pre_check():
return
for frame_processor_module in get_frame_processors_modules(facefusion.globals.frame_processors):
if not frame_processor_module.pre_check():
@@ -270,7 +270,8 @@ def force_download() -> None:
[
content_analyser.MODELS,
face_analyser.MODELS,
face_masker.MODELS
face_masker.MODELS,
audio_extractor.MODELS
]
for frame_processor_module in get_frame_processors_modules(available_frame_processors):
@@ -19,11 +19,12 @@ from facefusion.normalizer import normalize_output_path
from facefusion.typing import Face, VisionFrame, UpdateProcess, ProcessMode, ModelSet, OptionsWithModel, AudioFrame, QueuePayload
from facefusion.filesystem import is_file, has_audio, resolve_relative_path
from facefusion.download import conditional_download, is_download_done
from facefusion.audio import read_static_audio, get_audio_frame, create_empty_audio_frame
from facefusion.audio import read_static_voice, get_voice_frame, create_empty_audio_frame
from facefusion.filesystem import is_image, is_video, filter_audio_paths
from facefusion.common_helper import get_first
from facefusion.vision import read_image, write_image, read_static_image
from facefusion.processors.frame.typings import LipSyncerInputs
from facefusion.audio_extractor import clear_voice_extractor
from facefusion.processors.frame import globals as frame_processors_globals
from facefusion.processors.frame import choices as frame_processors_choices
@@ -125,7 +126,7 @@ def pre_process(mode : ProcessMode) -> bool:
def post_process() -> None:
read_static_image.cache_clear()
read_static_audio.cache_clear()
read_static_voice.cache_clear()
if facefusion.globals.video_memory_strategy == 'strict' or facefusion.globals.video_memory_strategy == 'moderate':
clear_frame_processor()
if facefusion.globals.video_memory_strategy == 'strict':
@@ -133,6 +134,7 @@ def post_process() -> None:
clear_content_analyser()
clear_face_occluder()
clear_face_parser()
clear_voice_extractor()
def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
@@ -222,7 +224,7 @@ def process_frames(source_paths : List[str], queue_payloads : List[QueuePayload]
for queue_payload in process_manager.manage(queue_payloads):
frame_number = queue_payload['frame_number']
target_vision_path = queue_payload['frame_path']
source_audio_frame = get_audio_frame(source_audio_path, facefusion.globals.output_video_fps, frame_number)
source_audio_frame = get_voice_frame(source_audio_path, facefusion.globals.output_video_fps, frame_number)
if not numpy.any(source_audio_frame):
source_audio_frame = create_empty_audio_frame()
target_vision_frame = read_image(target_vision_path)
+1
View File
@@ -43,6 +43,7 @@ Translation = numpy.ndarray[Any, Any]
# Raw audio data as bytes.
AudioBuffer = bytes
# The aliases below are all plain numpy arrays; the distinct names only document
# the role of a value (presumably: whole signal, slice of a signal, one frame of
# a spectrogram, full spectrogram - confirm against their producers).
Audio = numpy.ndarray[Any, Any]
AudioChunk = numpy.ndarray[Any, Any]
AudioFrame = numpy.ndarray[Any, Any]
Spectrogram = numpy.ndarray[Any, Any]