Add a voice extraction pass ahead of the lip syncer's spectrogram step.

Files touched:
  facefusion/audio.py
      adds get_voice_frame() and read_static_voice(); the latter runs
      batch_extract_voice() on the decoded buffer before normalize_audio().
  facefusion/audio_extractor.py (new)
      onnxruntime session handling for Kim_Vocal_2.onnx plus the chunked
      scipy.signal.stft -> model -> scipy.signal.istft helpers.
  facefusion/core.py
      audio_extractor joins the pre_check chain and the force_download list.
  facefusion/processors/frame/modules/lip_syncer.py
      switches to the voice variants and calls clear_voice_extractor()
      from post_process().
  facefusion/typing.py
      adds the AudioChunk alias.

NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been collapsed. Blank lines were placed so that every
hunk matches its header counts; indentation is assumed to be tabs, one per
nesting level. Confirm against the target tree before applying, or apply
with whitespace checks relaxed.

diff --git a/facefusion/audio.py b/facefusion/audio.py
index a77448f3..a028b29d 100644
--- a/facefusion/audio.py
+++ b/facefusion/audio.py
@@ -6,6 +6,7 @@ import scipy
 from facefusion.filesystem import is_audio
 from facefusion.ffmpeg import read_audio_buffer
 from facefusion.typing import Fps, Audio, Spectrogram, AudioFrame
+from facefusion.audio_extractor import batch_extract_voice
 
 
 def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
@@ -16,6 +17,14 @@ def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Opti
 	return None
 
 
+def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
+	if is_audio(audio_path):
+		audio_frames = read_static_voice(audio_path, fps)
+		if frame_number in range(len(audio_frames)):
+			return audio_frames[frame_number]
+	return None
+
+
 def create_empty_audio_frame() -> AudioFrame:
 	audio_frame = numpy.zeros((80, 16), dtype = numpy.int16)
 	return audio_frame
@@ -34,6 +43,20 @@ def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
 	return None
 
 
+@lru_cache(maxsize = None)
+def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
+	if is_audio(audio_path):
+		audio_buffer = read_audio_buffer(audio_path, 16000, 2)
+		audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
+		audio = batch_extract_voice(audio, 1000000, 0.75)
+		audio = normalize_audio(audio)
+		audio = filter_audio(audio, -0.97)
+		spectrogram = create_spectrogram(audio, 16000, 80, 800, 55.0, 7600.0)
+		audio_frames = extract_audio_frames(spectrogram, 80, 16, fps)
+		return audio_frames
+	return None
+
+
 def normalize_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
 	if audio.ndim > 1:
 		audio = numpy.mean(audio, axis = 1)
diff --git a/facefusion/audio_extractor.py b/facefusion/audio_extractor.py
new file mode 100644
index 00000000..1e20ab3e
--- /dev/null
+++ b/facefusion/audio_extractor.py
@@ -0,0 +1,132 @@
+from typing import Any, Tuple
+from functools import lru_cache
+from time import sleep
+import threading
+import scipy
+import numpy
+import onnxruntime
+
+import facefusion.globals
+from facefusion import process_manager
+from facefusion.typing import ModelSet, AudioChunk, Audio
+from facefusion.execution import apply_execution_provider_options
+from facefusion.filesystem import resolve_relative_path
+from facefusion.download import conditional_download
+
+VOICE_EXTRACTOR = None
+THREAD_LOCK : threading.Lock = threading.Lock()
+MODELS : ModelSet =\
+{
+	'voice_extractor':
+	{
+		'url': 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Vocal_2.onnx',
+		'path': resolve_relative_path('../.assets/models/Kim_Vocal_2.onnx')
+	}
+}
+
+
+def get_voice_extractor() -> Any:
+	global VOICE_EXTRACTOR
+
+	with THREAD_LOCK:
+		while process_manager.is_checking():
+			sleep(0.5)
+		if VOICE_EXTRACTOR is None:
+			model_path = MODELS.get('voice_extractor').get('path')
+			VOICE_EXTRACTOR = onnxruntime.InferenceSession(model_path, providers = apply_execution_provider_options(facefusion.globals.execution_providers))
+	return VOICE_EXTRACTOR
+
+
+def clear_voice_extractor() -> None:
+	global VOICE_EXTRACTOR
+
+	VOICE_EXTRACTOR = None
+
+
+def pre_check() -> bool:
+	if not facefusion.globals.skip_download:
+		download_directory_path = resolve_relative_path('../.assets/models')
+		model_urls =\
+		[
+			MODELS.get('voice_extractor').get('url'),
+		]
+		process_manager.check()
+		conditional_download(download_directory_path, model_urls)
+		process_manager.end()
+	return True
+
+
+@lru_cache(maxsize = None)
+def create_static_hanning_window(filter_size : int) -> Any:
+	window = scipy.signal.windows.hann(filter_size, sym = False)
+	return window
+
+
+def batch_extract_voice(audio : Audio, chunk_size : int, overlap_size : float) -> Audio:
+	step_size = int(chunk_size * (1 - overlap_size))
+	audio_total = numpy.zeros((audio.shape[0], 2), dtype = numpy.float32)
+	audio_count = numpy.zeros((audio.shape[0], 2), dtype = numpy.float32)
+	for start in range(0, audio.shape[0], step_size):
+		end = min(start + chunk_size, audio.shape[0])
+		audio_total[start:end, ...] += extract_voice(audio[start:end, ...])
+		audio_count[start:end, ...] += 1
+	audio = audio_total / audio_count
+	return audio
+
+
+def extract_voice(audio_chunk : AudioChunk) -> AudioChunk:
+	voice_extractor = get_voice_extractor()
+	extractor_shape = voice_extractor.get_inputs()[0].shape[1:]
+	hop_length = 1024
+	filter_size = 7680
+	trim_size = filter_size // 2
+	frequency_bins = trim_size + 1
+	chunk_size = hop_length * (extractor_shape[2] - 1)
+	audio_chunk, pad_size = prepare_audio_chunk(audio_chunk, chunk_size, trim_size)
+	audio_chunk = decompose_audio_chunk(audio_chunk, filter_size, hop_length, frequency_bins, extractor_shape)
+	audio_chunk = voice_extractor.run(None,
+	{
+		voice_extractor.get_inputs()[0].name: audio_chunk
+	})[0]
+	audio_chunk = compose_audio_chunk(audio_chunk, filter_size, hop_length, frequency_bins, extractor_shape)
+	audio_chunk = normalize_audio_chunk(audio_chunk, chunk_size, trim_size, pad_size)
+	return audio_chunk
+
+
+def prepare_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int) -> Tuple[AudioChunk, int]:
+	audio_chunk = audio_chunk.T
+	step_size = chunk_size - 2 * trim_size
+	pad_size = step_size - audio_chunk.shape[1] % step_size
+	audio_chunk_size = audio_chunk.shape[1] + pad_size
+	audio_chunk = audio_chunk.astype(numpy.float32) / numpy.iinfo(numpy.int16).max
+	audio_chunk = numpy.pad(audio_chunk, ((0, 0), (trim_size, trim_size + pad_size)), mode='constant', constant_values = 0)
+	audio_chunk = numpy.concatenate([audio_chunk[:,i:i + chunk_size] for i in range(0, audio_chunk_size, step_size)], axis = 0)
+	audio_chunk = audio_chunk.reshape((-1, chunk_size))
+	return audio_chunk, pad_size
+
+
+def decompose_audio_chunk(audio_chunk : AudioChunk, filter_size : int, hop_length : int, frequency_bins : int, extractor_shape : Tuple[int, int, int]) -> AudioChunk:
+	window = create_static_hanning_window(filter_size)
+	audio_chunk = scipy.signal.stft(audio_chunk, nperseg = filter_size, noverlap = filter_size - hop_length, window = window, padded = False)[2]
+	audio_chunk = numpy.stack((numpy.real(audio_chunk), numpy.imag(audio_chunk)), axis = -1).transpose((0, 3, 1, 2))
+	audio_chunk = audio_chunk.reshape((-1, 2, 2, frequency_bins, extractor_shape[2])).reshape((-1, extractor_shape[0], frequency_bins, extractor_shape[2]))
+	audio_chunk = audio_chunk[:,:,:extractor_shape[1]]
+	audio_chunk /= numpy.sqrt(1.0 / window.sum() ** 2)
+	return audio_chunk
+
+
+def compose_audio_chunk(audio_chunk : AudioChunk, filter_size : int, hop_length : int, frequency_bins : int, extractor_shape : Tuple[int, int, int]) -> AudioChunk:
+	window = create_static_hanning_window(filter_size)
+	audio_chunk = numpy.pad(audio_chunk, ((0, 0), (0, 0), (0, frequency_bins - extractor_shape[1]), (0, 0)), mode = 'constant')
+	audio_chunk = audio_chunk.reshape(-1, 2, frequency_bins, extractor_shape[2]).transpose((0, 2, 3, 1))
+	audio_chunk = audio_chunk[:,:,:,0] + 1j * audio_chunk[:,:,:,1]
+	audio_chunk = scipy.signal.istft(audio_chunk, nperseg = filter_size, noverlap = filter_size - hop_length, window = window)[1]
+	audio_chunk *= numpy.sqrt(1.0 / window.sum() ** 2)
+	return audio_chunk
+
+
+def normalize_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int, pad_size : int) -> AudioChunk:
+	audio_chunk = audio_chunk.reshape((-1, 2, chunk_size))
+	audio_chunk = audio_chunk[:,:,trim_size:-trim_size].transpose(1, 0, 2)
+	audio_chunk = audio_chunk.reshape(2, -1)[:,:-pad_size].T
+	return audio_chunk
diff --git a/facefusion/core.py b/facefusion/core.py
index c006b3bc..b5bd11a8 100755
--- a/facefusion/core.py
+++ b/facefusion/core.py
@@ -15,7 +15,7 @@ import facefusion.choices
 import facefusion.globals
 from facefusion.face_analyser import get_one_face, get_average_face
 from facefusion.face_store import get_reference_faces, append_reference_face
-from facefusion import face_analyser, face_masker, content_analyser, config, process_manager, metadata, logger, wording
+from facefusion import face_analyser, face_masker, content_analyser, config, process_manager, metadata, logger, wording, audio_extractor
 from facefusion.content_analyser import analyse_image, analyse_video
 from facefusion.processors.frame.core import get_frame_processors_modules, load_frame_processor_module
 from facefusion.common_helper import create_metavar, get_first
@@ -193,7 +193,7 @@ def run(program : ArgumentParser) -> None:
 	if facefusion.globals.force_download:
 		force_download()
 		return
-	if not pre_check() or not content_analyser.pre_check() or not face_analyser.pre_check() or not face_masker.pre_check():
+	if not pre_check() or not content_analyser.pre_check() or not face_analyser.pre_check() or not face_masker.pre_check() or not audio_extractor.pre_check():
 		return
 	for frame_processor_module in get_frame_processors_modules(facefusion.globals.frame_processors):
 		if not frame_processor_module.pre_check():
@@ -270,7 +270,8 @@ def force_download() -> None:
 	[
 		content_analyser.MODELS,
 		face_analyser.MODELS,
-		face_masker.MODELS
+		face_masker.MODELS,
+		audio_extractor.MODELS
 	]
 
 	for frame_processor_module in get_frame_processors_modules(available_frame_processors):
diff --git a/facefusion/processors/frame/modules/lip_syncer.py b/facefusion/processors/frame/modules/lip_syncer.py
index c502531e..10734953 100755
--- a/facefusion/processors/frame/modules/lip_syncer.py
+++ b/facefusion/processors/frame/modules/lip_syncer.py
@@ -19,11 +19,12 @@ from facefusion.normalizer import normalize_output_path
 from facefusion.typing import Face, VisionFrame, UpdateProcess, ProcessMode, ModelSet, OptionsWithModel, AudioFrame, QueuePayload
 from facefusion.filesystem import is_file, has_audio, resolve_relative_path
 from facefusion.download import conditional_download, is_download_done
-from facefusion.audio import read_static_audio, get_audio_frame, create_empty_audio_frame
+from facefusion.audio import read_static_voice, get_voice_frame, create_empty_audio_frame
 from facefusion.filesystem import is_image, is_video, filter_audio_paths
 from facefusion.common_helper import get_first
 from facefusion.vision import read_image, write_image, read_static_image
 from facefusion.processors.frame.typings import LipSyncerInputs
+from facefusion.audio_extractor import clear_voice_extractor
 from facefusion.processors.frame import globals as frame_processors_globals
 from facefusion.processors.frame import choices as frame_processors_choices
 
@@ -125,7 +126,7 @@ def pre_process(mode : ProcessMode) -> bool:
 
 def post_process() -> None:
 	read_static_image.cache_clear()
-	read_static_audio.cache_clear()
+	read_static_voice.cache_clear()
 	if facefusion.globals.video_memory_strategy == 'strict' or facefusion.globals.video_memory_strategy == 'moderate':
 		clear_frame_processor()
 	if facefusion.globals.video_memory_strategy == 'strict':
@@ -133,6 +134,7 @@ def post_process() -> None:
 		clear_content_analyser()
 		clear_face_occluder()
 		clear_face_parser()
+		clear_voice_extractor()
 
 
 def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
@@ -222,7 +224,7 @@ def process_frames(source_paths : List[str], queue_payloads : List[QueuePayload]
 	for queue_payload in process_manager.manage(queue_payloads):
 		frame_number = queue_payload['frame_number']
 		target_vision_path = queue_payload['frame_path']
-		source_audio_frame = get_audio_frame(source_audio_path, facefusion.globals.output_video_fps, frame_number)
+		source_audio_frame = get_voice_frame(source_audio_path, facefusion.globals.output_video_fps, frame_number)
 		if not numpy.any(source_audio_frame):
 			source_audio_frame = create_empty_audio_frame()
 		target_vision_frame = read_image(target_vision_path)
diff --git a/facefusion/typing.py b/facefusion/typing.py
index 9d0c36f1..717facee 100755
--- a/facefusion/typing.py
+++ b/facefusion/typing.py
@@ -43,6 +43,7 @@ Translation = numpy.ndarray[Any, Any]
 
 AudioBuffer = bytes
 Audio = numpy.ndarray[Any, Any]
+AudioChunk = numpy.ndarray[Any, Any]
 AudioFrame = numpy.ndarray[Any, Any]
 Spectrogram = numpy.ndarray[Any, Any]
 