mirror of
https://github.com/facefusion/facefusion.git
synced 2026-06-07 05:03:54 +02:00
Voice separator (#468)
* changes * changes * changes * changes * changes * changes
This commit is contained in:
@@ -6,6 +6,7 @@ import scipy
|
||||
from facefusion.filesystem import is_audio
|
||||
from facefusion.ffmpeg import read_audio_buffer
|
||||
from facefusion.typing import Fps, Audio, Spectrogram, AudioFrame
|
||||
from facefusion.audio_extractor import batch_extract_voice
|
||||
|
||||
|
||||
def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
|
||||
@@ -16,6 +17,14 @@ def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Opti
|
||||
return None
|
||||
|
||||
|
||||
def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
    """
    Return the voice frame stored at frame_number for audio_path.

    None is returned when is_audio() rejects the path or when frame_number
    is not a valid index into the frames produced by read_static_voice().
    """
    if not is_audio(audio_path):
        return None
    voice_frames = read_static_voice(audio_path, fps)
    if frame_number not in range(len(voice_frames)):
        return None
    return voice_frames[frame_number]
|
||||
|
||||
|
||||
def create_empty_audio_frame() -> AudioFrame:
    """Return a zero-filled int16 array of shape (80, 16)."""
    return numpy.zeros((80, 16), dtype = numpy.int16)
|
||||
@@ -34,6 +43,20 @@ def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize = None)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """
    Build the voice-only audio frames for audio_path, cached per (audio_path, fps).

    The audio buffer is read as int16 samples in two columns, run through
    batch_extract_voice(), then normalized, filtered and turned into a
    spectrogram from which the frames are extracted. Returns None when
    is_audio() rejects the path.
    """
    if not is_audio(audio_path):
        return None
    # NOTE(review): 16000 and 2 are presumably sample rate and channel count - confirm against read_audio_buffer
    raw_buffer = read_audio_buffer(audio_path, 16000, 2)
    samples = numpy.frombuffer(raw_buffer, dtype = numpy.int16).reshape(-1, 2)
    voice = batch_extract_voice(samples, 1000000, 0.75)
    voice = filter_audio(normalize_audio(voice), -0.97)
    spectrogram = create_spectrogram(voice, 16000, 80, 800, 55.0, 7600.0)
    return extract_audio_frames(spectrogram, 80, 16, fps)
|
||||
|
||||
|
||||
def normalize_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
|
||||
if audio.ndim > 1:
|
||||
audio = numpy.mean(audio, axis = 1)
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
from typing import Any, Tuple
|
||||
from functools import lru_cache
|
||||
from time import sleep
|
||||
import threading
|
||||
import scipy
|
||||
import numpy
|
||||
import onnxruntime
|
||||
|
||||
import facefusion.globals
|
||||
from facefusion import process_manager
|
||||
from facefusion.typing import ModelSet, AudioChunk, Audio
|
||||
from facefusion.execution import apply_execution_provider_options
|
||||
from facefusion.filesystem import resolve_relative_path
|
||||
from facefusion.download import conditional_download
|
||||
|
||||
# inference session created on first use by get_voice_extractor()
VOICE_EXTRACTOR = None
# serializes the creation of the shared inference session
THREAD_LOCK : threading.Lock = threading.Lock()
# download url and local path of the voice extractor model
MODELS : ModelSet = {
    'voice_extractor': {
        'url': 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/Kim_Vocal_2.onnx',
        'path': resolve_relative_path('../.assets/models/Kim_Vocal_2.onnx')
    }
}
|
||||
|
||||
|
||||
def get_voice_extractor() -> Any:
    """
    Return the shared voice extractor session, creating it on first use.

    Creation happens under THREAD_LOCK and only after process_manager
    stops reporting the checking state (polled every 0.5 seconds).
    """
    global VOICE_EXTRACTOR

    with THREAD_LOCK:
        # wait until the process manager is no longer in the checking state
        while process_manager.is_checking():
            sleep(0.5)
        if VOICE_EXTRACTOR is None:
            model_path = MODELS.get('voice_extractor').get('path')
            providers = apply_execution_provider_options(facefusion.globals.execution_providers)
            VOICE_EXTRACTOR = onnxruntime.InferenceSession(model_path, providers = providers)
    return VOICE_EXTRACTOR
|
||||
|
||||
|
||||
def clear_voice_extractor() -> None:
    """Drop the shared voice extractor session so the next get_voice_extractor() call recreates it."""
    global VOICE_EXTRACTOR

    VOICE_EXTRACTOR = None
|
||||
|
||||
|
||||
def pre_check() -> bool:
    """
    Download the voice extractor model unless downloads are skipped.

    The download is bracketed by process_manager.check() and
    process_manager.end(). Any exception raised by conditional_download()
    still propagates to the caller.

    Returns:
        Always True.
    """
    if not facefusion.globals.skip_download:
        download_directory_path = resolve_relative_path('../.assets/models')
        model_urls =\
        [
            MODELS.get('voice_extractor').get('url')
        ]
        process_manager.check()
        try:
            conditional_download(download_directory_path, model_urls)
        finally:
            # always call end(): get_voice_extractor() polls is_checking() while holding THREAD_LOCK,
            # so a failed download without end() could leave it waiting indefinitely
            process_manager.end()
    return True
|
||||
|
||||
|
||||
@lru_cache(maxsize = None)
def create_static_hanning_window(filter_size : int) -> Any:
    """Return the cached Hann window of length filter_size, built with sym = False."""
    return scipy.signal.windows.hann(filter_size, sym = False)
|
||||
|
||||
|
||||
def batch_extract_voice(audio : Audio, chunk_size : int, overlap_size : float) -> Audio:
    """
    Run extract_voice() over overlapping windows of audio and average the results.

    Windows are chunk_size rows long and start every
    int(chunk_size * (1 - overlap_size)) rows. Each row of the result is
    the sum of all window outputs covering it divided by how many windows
    covered it. The result is a float32 array with two columns.
    """
    sample_total = audio.shape[0]
    step_size = int(chunk_size * (1 - overlap_size))
    voice_sum = numpy.zeros((sample_total, 2), dtype = numpy.float32)
    cover_count = numpy.zeros((sample_total, 2), dtype = numpy.float32)

    for start in range(0, sample_total, step_size):
        # the final windows are clipped to the end of the audio
        end = min(start + chunk_size, sample_total)
        voice_sum[start:end, ...] += extract_voice(audio[start:end, ...])
        cover_count[start:end, ...] += 1
    return voice_sum / cover_count
|
||||
|
||||
|
||||
def extract_voice(audio_chunk : AudioChunk) -> AudioChunk:
    """
    Pass one audio chunk through the voice extractor model.

    The chunk is split into model-sized segments, decomposed via
    decompose_audio_chunk(), run through the inference session, recomposed
    via compose_audio_chunk() and finally trimmed and un-padded by
    normalize_audio_chunk().
    """
    hop_length = 1024
    filter_size = 7680
    trim_size = filter_size // 2
    frequency_bins = trim_size + 1

    voice_extractor = get_voice_extractor()
    extractor_input = voice_extractor.get_inputs()[0]
    # model input shape without its leading dimension
    extractor_shape = extractor_input.shape[1:]
    chunk_size = hop_length * (extractor_shape[2] - 1)

    segments, pad_size = prepare_audio_chunk(audio_chunk, chunk_size, trim_size)
    spectrum = decompose_audio_chunk(segments, filter_size, hop_length, frequency_bins, extractor_shape)
    prediction = voice_extractor.run(None,
    {
        extractor_input.name: spectrum
    })[0]
    waveform = compose_audio_chunk(prediction, filter_size, hop_length, frequency_bins, extractor_shape)
    return normalize_audio_chunk(waveform, chunk_size, trim_size, pad_size)
|
||||
|
||||
|
||||
def prepare_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int) -> Tuple[AudioChunk, int]:
    """
    Scale, pad and slice an audio chunk into overlapping segments of chunk_size.

    The chunk is transposed, converted to float32 and divided by the int16
    maximum. It is zero-padded by trim_size at the front and by trim_size
    plus pad_size at the back, then cut every chunk_size - 2 * trim_size
    samples. The segments are stacked along the first axis.

    Returns:
        The stacked segments with chunk_size columns, and the pad_size
        that was appended (always between 1 and the step size).
    """
    channels = audio_chunk.T
    sample_total = channels.shape[1]
    step_size = chunk_size - 2 * trim_size
    # never zero: an exact multiple of step_size still receives one full step of padding
    pad_size = step_size - sample_total % step_size
    padded_total = sample_total + pad_size

    channels = channels.astype(numpy.float32) / numpy.iinfo(numpy.int16).max
    channels = numpy.pad(channels, ((0, 0), (trim_size, trim_size + pad_size)), mode = 'constant', constant_values = 0)
    segments = []
    for offset in range(0, padded_total, step_size):
        segments.append(channels[:, offset:offset + chunk_size])
    stacked = numpy.concatenate(segments, axis = 0).reshape((-1, chunk_size))
    return stacked, pad_size
|
||||
|
||||
|
||||
def decompose_audio_chunk(audio_chunk : AudioChunk, filter_size : int, hop_length : int, frequency_bins : int, extractor_shape : Tuple[int, int, int]) -> AudioChunk:
    """
    Turn stacked audio segments into the real / imaginary STFT layout fed to the model.

    The STFT uses the cached Hann window of filter_size with a hop of
    hop_length. Real and imaginary parts are stacked and reshaped to
    (-1, extractor_shape[0], frequency_bins, extractor_shape[2]), cropped
    to extractor_shape[1] along the third axis and rescaled by the window sum.
    """
    window = create_static_hanning_window(filter_size)
    overlap = filter_size - hop_length
    _, _, spectrum = scipy.signal.stft(audio_chunk, nperseg = filter_size, noverlap = overlap, window = window, padded = False)
    # split the complex values into a trailing axis and move it in front of the frequency axis
    components = numpy.stack((numpy.real(spectrum), numpy.imag(spectrum)), axis = -1).transpose((0, 3, 1, 2))
    components = components.reshape((-1, 2, 2, frequency_bins, extractor_shape[2]))
    components = components.reshape((-1, extractor_shape[0], frequency_bins, extractor_shape[2]))
    components = components[:, :, :extractor_shape[1]]
    # in-place on purpose so the array keeps its dtype
    components /= numpy.sqrt(1.0 / window.sum() ** 2)
    return components
|
||||
|
||||
|
||||
def compose_audio_chunk(audio_chunk : AudioChunk, filter_size : int, hop_length : int, frequency_bins : int, extractor_shape : Tuple[int, int, int]) -> AudioChunk:
    """
    Convert model output in real / imaginary layout back into waveforms.

    The third axis is zero-padded from extractor_shape[1] up to
    frequency_bins, the real and imaginary planes are merged into complex
    values, and the inverse STFT is taken with the cached Hann window.
    The result is rescaled by the window sum.
    """
    window = create_static_hanning_window(filter_size)
    missing_bins = frequency_bins - extractor_shape[1]
    padded = numpy.pad(audio_chunk, ((0, 0), (0, 0), (0, missing_bins), (0, 0)), mode = 'constant')
    # move the real / imaginary axis to the end before merging it into complex values
    paired = padded.reshape(-1, 2, frequency_bins, extractor_shape[2]).transpose((0, 2, 3, 1))
    spectrum = paired[:, :, :, 0] + 1j * paired[:, :, :, 1]
    _, waveform = scipy.signal.istft(spectrum, nperseg = filter_size, noverlap = filter_size - hop_length, window = window)
    # in-place on purpose so the array keeps its dtype
    waveform *= numpy.sqrt(1.0 / window.sum() ** 2)
    return waveform
|
||||
|
||||
|
||||
def normalize_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int, pad_size : int) -> AudioChunk:
    """
    Join processed segments back into one array with two columns.

    Rows are grouped in pairs of chunk_size samples, trim_size samples are
    cut from both ends of every segment, the segments of each pair member
    are joined end to end, and the last pad_size samples are dropped
    before transposing.
    """
    segments = audio_chunk.reshape((-1, 2, chunk_size))
    trimmed = segments[:, :, trim_size:-trim_size]
    # bring the pair axis to the front so each member's segments become contiguous
    by_channel = numpy.swapaxes(trimmed, 0, 1).reshape(2, -1)
    return by_channel[:, :-pad_size].T
|
||||
+4
-3
@@ -15,7 +15,7 @@ import facefusion.choices
|
||||
import facefusion.globals
|
||||
from facefusion.face_analyser import get_one_face, get_average_face
|
||||
from facefusion.face_store import get_reference_faces, append_reference_face
|
||||
from facefusion import face_analyser, face_masker, content_analyser, config, process_manager, metadata, logger, wording
|
||||
from facefusion import face_analyser, face_masker, content_analyser, config, process_manager, metadata, logger, wording, audio_extractor
|
||||
from facefusion.content_analyser import analyse_image, analyse_video
|
||||
from facefusion.processors.frame.core import get_frame_processors_modules, load_frame_processor_module
|
||||
from facefusion.common_helper import create_metavar, get_first
|
||||
@@ -193,7 +193,7 @@ def run(program : ArgumentParser) -> None:
|
||||
if facefusion.globals.force_download:
|
||||
force_download()
|
||||
return
|
||||
if not pre_check() or not content_analyser.pre_check() or not face_analyser.pre_check() or not face_masker.pre_check():
|
||||
if not pre_check() or not content_analyser.pre_check() or not face_analyser.pre_check() or not face_masker.pre_check() or not audio_extractor.pre_check():
|
||||
return
|
||||
for frame_processor_module in get_frame_processors_modules(facefusion.globals.frame_processors):
|
||||
if not frame_processor_module.pre_check():
|
||||
@@ -270,7 +270,8 @@ def force_download() -> None:
|
||||
[
|
||||
content_analyser.MODELS,
|
||||
face_analyser.MODELS,
|
||||
face_masker.MODELS
|
||||
face_masker.MODELS,
|
||||
audio_extractor.MODELS
|
||||
]
|
||||
|
||||
for frame_processor_module in get_frame_processors_modules(available_frame_processors):
|
||||
|
||||
@@ -19,11 +19,12 @@ from facefusion.normalizer import normalize_output_path
|
||||
from facefusion.typing import Face, VisionFrame, UpdateProcess, ProcessMode, ModelSet, OptionsWithModel, AudioFrame, QueuePayload
|
||||
from facefusion.filesystem import is_file, has_audio, resolve_relative_path
|
||||
from facefusion.download import conditional_download, is_download_done
|
||||
from facefusion.audio import read_static_audio, get_audio_frame, create_empty_audio_frame
|
||||
from facefusion.audio import read_static_voice, get_voice_frame, create_empty_audio_frame
|
||||
from facefusion.filesystem import is_image, is_video, filter_audio_paths
|
||||
from facefusion.common_helper import get_first
|
||||
from facefusion.vision import read_image, write_image, read_static_image
|
||||
from facefusion.processors.frame.typings import LipSyncerInputs
|
||||
from facefusion.audio_extractor import clear_voice_extractor
|
||||
from facefusion.processors.frame import globals as frame_processors_globals
|
||||
from facefusion.processors.frame import choices as frame_processors_choices
|
||||
|
||||
@@ -125,7 +126,7 @@ def pre_process(mode : ProcessMode) -> bool:
|
||||
|
||||
def post_process() -> None:
|
||||
read_static_image.cache_clear()
|
||||
read_static_audio.cache_clear()
|
||||
read_static_voice.cache_clear()
|
||||
if facefusion.globals.video_memory_strategy == 'strict' or facefusion.globals.video_memory_strategy == 'moderate':
|
||||
clear_frame_processor()
|
||||
if facefusion.globals.video_memory_strategy == 'strict':
|
||||
@@ -133,6 +134,7 @@ def post_process() -> None:
|
||||
clear_content_analyser()
|
||||
clear_face_occluder()
|
||||
clear_face_parser()
|
||||
clear_voice_extractor()
|
||||
|
||||
|
||||
def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
|
||||
@@ -222,7 +224,7 @@ def process_frames(source_paths : List[str], queue_payloads : List[QueuePayload]
|
||||
for queue_payload in process_manager.manage(queue_payloads):
|
||||
frame_number = queue_payload['frame_number']
|
||||
target_vision_path = queue_payload['frame_path']
|
||||
source_audio_frame = get_audio_frame(source_audio_path, facefusion.globals.output_video_fps, frame_number)
|
||||
source_audio_frame = get_voice_frame(source_audio_path, facefusion.globals.output_video_fps, frame_number)
|
||||
if not numpy.any(source_audio_frame):
|
||||
source_audio_frame = create_empty_audio_frame()
|
||||
target_vision_frame = read_image(target_vision_path)
|
||||
|
||||
@@ -43,6 +43,7 @@ Translation = numpy.ndarray[Any, Any]
|
||||
|
||||
AudioBuffer = bytes
|
||||
Audio = numpy.ndarray[Any, Any]
|
||||
AudioChunk = numpy.ndarray[Any, Any]
|
||||
AudioFrame = numpy.ndarray[Any, Any]
|
||||
Spectrogram = numpy.ndarray[Any, Any]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user