From 79d7e2aceabbe6f5f13c62417fa8f8bd010246d8 Mon Sep 17 00:00:00 2001 From: henryruhs Date: Tue, 2 Apr 2024 11:23:26 +0200 Subject: [PATCH] Rename audio extractor to voice extractor --- facefusion/audio.py | 8 ++--- facefusion/core.py | 6 ++-- .../processors/frame/modules/lip_syncer.py | 2 +- ...{audio_extractor.py => voice_extractor.py} | 29 ++++++++++--------- 4 files changed, 23 insertions(+), 22 deletions(-) rename facefusion/{audio_extractor.py => voice_extractor.py} (87%) diff --git a/facefusion/audio.py b/facefusion/audio.py index a028b29d..ab703615 100644 --- a/facefusion/audio.py +++ b/facefusion/audio.py @@ -6,7 +6,7 @@ import scipy from facefusion.filesystem import is_audio from facefusion.ffmpeg import read_audio_buffer from facefusion.typing import Fps, Audio, Spectrogram, AudioFrame -from facefusion.audio_extractor import batch_extract_voice +from facefusion.voice_extractor import batch_extract_voice def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]: @@ -19,9 +19,9 @@ def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Opti def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]: if is_audio(audio_path): - audio_frames = read_static_voice(audio_path, fps) - if frame_number in range(len(audio_frames)): - return audio_frames[frame_number] + voice_frames = read_static_voice(audio_path, fps) + if frame_number in range(len(voice_frames)): + return voice_frames[frame_number] return None diff --git a/facefusion/core.py b/facefusion/core.py index b5bd11a8..715bd71f 100755 --- a/facefusion/core.py +++ b/facefusion/core.py @@ -15,7 +15,7 @@ import facefusion.choices import facefusion.globals from facefusion.face_analyser import get_one_face, get_average_face from facefusion.face_store import get_reference_faces, append_reference_face -from facefusion import face_analyser, face_masker, content_analyser, config, process_manager, metadata, logger, wording, audio_extractor +from facefusion import face_analyser, face_masker, content_analyser, config, process_manager, metadata, logger, wording, voice_extractor from facefusion.content_analyser import analyse_image, analyse_video from facefusion.processors.frame.core import get_frame_processors_modules, load_frame_processor_module from facefusion.common_helper import create_metavar, get_first @@ -193,7 +193,7 @@ def run(program : ArgumentParser) -> None: if facefusion.globals.force_download: force_download() return - if not pre_check() or not content_analyser.pre_check() or not face_analyser.pre_check() or not face_masker.pre_check() or not audio_extractor.pre_check(): + if not pre_check() or not content_analyser.pre_check() or not face_analyser.pre_check() or not face_masker.pre_check() or not voice_extractor.pre_check(): return for frame_processor_module in get_frame_processors_modules(facefusion.globals.frame_processors): if not frame_processor_module.pre_check(): @@ -271,7 +271,7 @@ def force_download() -> None: content_analyser.MODELS, face_analyser.MODELS, face_masker.MODELS, - audio_extractor.MODELS + voice_extractor.MODELS ] for frame_processor_module in get_frame_processors_modules(available_frame_processors): diff --git a/facefusion/processors/frame/modules/lip_syncer.py b/facefusion/processors/frame/modules/lip_syncer.py index 10734953..ebad2706 100755 --- a/facefusion/processors/frame/modules/lip_syncer.py +++ b/facefusion/processors/frame/modules/lip_syncer.py @@ -24,7 +24,7 @@ from facefusion.filesystem import is_image, is_video, filter_audio_paths from facefusion.common_helper import get_first from facefusion.vision import read_image, write_image, read_static_image from facefusion.processors.frame.typings import LipSyncerInputs -from facefusion.audio_extractor import clear_voice_extractor +from facefusion.voice_extractor import clear_voice_extractor from facefusion.processors.frame import globals as frame_processors_globals from facefusion.processors.frame import choices as frame_processors_choices diff --git a/facefusion/audio_extractor.py b/facefusion/voice_extractor.py similarity index 87% rename from facefusion/audio_extractor.py rename to facefusion/voice_extractor.py index 1e20ab3e..881cce72 100644 --- a/facefusion/audio_extractor.py +++ b/facefusion/voice_extractor.py @@ -10,17 +10,18 @@ import facefusion.globals from facefusion import process_manager from facefusion.typing import ModelSet, AudioChunk, Audio from facefusion.execution import apply_execution_provider_options -from facefusion.filesystem import resolve_relative_path +from facefusion.filesystem import resolve_relative_path, is_file from facefusion.download import conditional_download VOICE_EXTRACTOR = None +THREAD_SEMAPHORE : threading.Semaphore = threading.Semaphore() THREAD_LOCK : threading.Lock = threading.Lock() MODELS : ModelSet =\ { 'voice_extractor': { - 'url': 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Vocal_2.onnx', - 'path': resolve_relative_path('../.assets/models/Kim_Vocal_2.onnx') + 'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/voice_extractor.onnx', + 'path': resolve_relative_path('../.assets/models/voice_extractor.onnx') } } @@ -44,16 +45,15 @@ def clear_voice_extractor() -> None: def pre_check() -> bool: + download_directory_path = resolve_relative_path('../.assets/models') + model_url = MODELS.get('voice_extractor').get('url') + model_path = MODELS.get('voice_extractor').get('path') + if not facefusion.globals.skip_download: - download_directory_path = resolve_relative_path('../.assets/models') - model_urls =\ - [ - MODELS.get('voice_extractor').get('url'), - ] process_manager.check() - conditional_download(download_directory_path, model_urls) + conditional_download(download_directory_path, [ model_url ]) process_manager.end() - return True + return is_file(model_path) @lru_cache(maxsize = None) @@ -84,10 +84,11 @@ def extract_voice(audio_chunk : AudioChunk) -> AudioChunk: chunk_size = hop_length * (extractor_shape[2] - 1) audio_chunk, pad_size = prepare_audio_chunk(audio_chunk, chunk_size, trim_size) audio_chunk = decompose_audio_chunk(audio_chunk, filter_size, hop_length, frequency_bins, extractor_shape) - audio_chunk = voice_extractor.run(None, - { - voice_extractor.get_inputs()[0].name: audio_chunk - })[0] + with THREAD_SEMAPHORE: + audio_chunk = voice_extractor.run(None, + { + voice_extractor.get_inputs()[0].name: audio_chunk + })[0] audio_chunk = compose_audio_chunk(audio_chunk, filter_size, hop_length, frequency_bins, extractor_shape) audio_chunk = normalize_audio_chunk(audio_chunk, chunk_size, trim_size, pad_size) return audio_chunk