mirror of
https://github.com/facefusion/facefusion.git
synced 2026-06-07 05:03:54 +02:00
Some refactoring for audio.py and some astype() here and there
This commit is contained in:
+43
-33
@@ -9,6 +9,41 @@ from facefusion.typing import Fps, Audio, AudioFrame, Spectrogram, MelFilter
|
||||
from facefusion.voice_extractor import batch_extract_voice
|
||||
|
||||
|
||||
@lru_cache(maxsize = 128)
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """Cached front end for read_audio.

    Results are memoised per (audio_path, fps) pair and at most 128 entries
    are retained. A cache hit hands back the very same list object, so
    callers should treat the result as read-only.
    """
    audio_frames = read_audio(audio_path, fps)
    return audio_frames
|
||||
|
||||
|
||||
def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """Turn an audio file into a list of audio frames, without caching.

    Returns None when is_audio() rejects audio_path. Otherwise the file is
    read through read_audio_buffer(), viewed as int16 samples in two columns,
    passed through normalize_audio() and filter_audio(), converted with
    create_spectrogram() and finally cut into frames by
    extract_audio_frames() for the requested fps.
    """
    if not is_audio(audio_path):
        return None

    # The same rate is handed to the reader and to the spectrogram builder.
    sample_rate = 16000
    raw_buffer = read_audio_buffer(audio_path, sample_rate, 2)
    # Two int16 columns per row - presumably one per channel; confirm against read_audio_buffer.
    samples = numpy.frombuffer(raw_buffer, dtype = numpy.int16).reshape(-1, 2)
    samples = filter_audio(normalize_audio(samples), -0.97)
    spectrogram = create_spectrogram(samples, sample_rate, 80, 800, 55.0, 7600.0)
    return extract_audio_frames(spectrogram, 80, 16, fps)
|
||||
|
||||
|
||||
@lru_cache(maxsize = 128)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """Cached front end for read_voice.

    Results are memoised per (audio_path, fps) pair and at most 128 entries
    are retained. A cache hit hands back the very same list object, so
    callers should treat the result as read-only.
    """
    voice_frames = read_voice(audio_path, fps)
    return voice_frames
|
||||
|
||||
|
||||
def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """Turn an audio file into a list of voice-only audio frames, without caching.

    Identical to the read_audio() pipeline except that the raw samples are
    first run through batch_extract_voice() before normalisation. Returns
    None when is_audio() rejects audio_path.
    """
    if not is_audio(audio_path):
        return None

    # The same rate is handed to the reader and to the spectrogram builder.
    sample_rate = 16000
    raw_buffer = read_audio_buffer(audio_path, sample_rate, 2)
    # Two int16 columns per row - presumably one per channel; confirm against read_audio_buffer.
    samples = numpy.frombuffer(raw_buffer, dtype = numpy.int16).reshape(-1, 2)
    # Chunk size 1000000 and overlap 0.75 are forwarded to batch_extract_voice unchanged.
    samples = batch_extract_voice(samples, 1000000, 0.75)
    samples = filter_audio(normalize_audio(samples), -0.97)
    spectrogram = create_spectrogram(samples, sample_rate, 80, 800, 55.0, 7600.0)
    return extract_audio_frames(spectrogram, 80, 16, fps)
|
||||
|
||||
|
||||
def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
|
||||
if is_audio(audio_path):
|
||||
audio_frames = read_static_audio(audio_path, fps)
|
||||
@@ -26,37 +61,10 @@ def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Opti
|
||||
|
||||
|
||||
def create_empty_audio_frame() -> AudioFrame:
    """Return an all-zero audio frame of shape (80, 16) and dtype int16.

    The shape matches the (80, 16) arguments that the readers in this module
    pass to extract_audio_frames().
    """
    # Allocate once with the final dtype: no duplicate assignment and no
    # throwaway float64 array that is immediately cast.
    return numpy.zeros((80, 16), dtype = numpy.int16)
|
||||
|
||||
|
||||
@lru_cache(maxsize = 128)
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """Cached audio reader; the decoding itself lives in read_audio().

    This definition used to repeat the read_audio() pipeline line for line
    behind an unbounded cache. It now delegates to read_audio() so the
    pipeline exists in one place, and the cache is capped at 128
    (audio_path, fps) entries so long sessions cannot grow it without limit.
    Returns None when read_audio() does.
    """
    return read_audio(audio_path, fps)
|
||||
|
||||
|
||||
@lru_cache(maxsize = 128)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """Cached voice reader; the decoding itself lives in read_voice().

    This definition used to repeat the read_voice() pipeline line for line
    behind an unbounded cache. It now delegates to read_voice() so the
    pipeline exists in one place, and the cache is capped at 128
    (audio_path, fps) entries so long sessions cannot grow it without limit.
    Returns None when read_voice() does.
    """
    return read_voice(audio_path, fps)
|
||||
|
||||
|
||||
def normalize_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
|
||||
if audio.ndim > 1:
|
||||
audio = numpy.mean(audio, axis = 1)
|
||||
@@ -77,19 +85,20 @@ def convert_mel_to_hertz(mel : numpy.ndarray[Any, Any]) -> numpy.ndarray[Any, An
|
||||
return 700 * (10 ** (mel / 2595) - 1)
|
||||
|
||||
|
||||
def create_mel_filter(sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> MelFilter:
    """Build a filter matrix of shape (filter_total, filter_size // 2 + 1).

    filter_total + 2 edges are spaced evenly between
    convert_hertz_to_mel(frequency_minimum) and
    convert_hertz_to_mel(frequency_maximum), mapped back through
    convert_mel_to_hertz() and scaled to integer column indices. Row i then
    receives one triangular window covering the columns between edge i and
    edge i + 1; every other entry stays zero.
    """
    mel_filter = numpy.zeros((filter_total, filter_size // 2 + 1))
    mel_bins = numpy.linspace(convert_hertz_to_mel(frequency_minimum), convert_hertz_to_mel(frequency_maximum), filter_total + 2)
    indices = numpy.floor((filter_size + 1) * convert_mel_to_hertz(mel_bins) / sample_rate).astype(numpy.int16)

    # Each window is written exactly once; consecutive edges delimit one row.
    for index, (start, end) in enumerate(zip(indices[:filter_total], indices[1:filter_total + 1])):
        mel_filter[index, start:end] = scipy.signal.windows.triang(end - start)
    return mel_filter


@lru_cache(maxsize = None)
def create_static_mel_filter(sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> MelFilter:
    """Cached front end for create_mel_filter, kept so the older name stays callable.

    The cached array is shared between callers and must not be modified in place.
    """
    return create_mel_filter(sample_rate, filter_total, filter_size, frequency_minimum, frequency_maximum)
|
||||
|
||||
|
||||
def create_spectrogram(audio : Audio, sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> Spectrogram:
    """Project the STFT magnitude of audio onto the filter from create_mel_filter().

    The result has filter_total rows: the filter matrix multiplied by the
    absolute value of the short-time Fourier transform of audio.
    """
    # Build the filter once; a second construction would only overwrite the first.
    mel_filter = create_mel_filter(sample_rate, filter_total, filter_size, frequency_minimum, frequency_maximum)
    # scipy.signal.stft returns (frequencies, times, Zxx); only Zxx is needed.
    # noverlap is fixed at 600 and scipy requires noverlap < nperseg, so
    # filter_size has to be larger than 600.
    transform = scipy.signal.stft(audio, nperseg = filter_size, noverlap = 600, nfft = filter_size)[2]
    spectrogram = numpy.dot(mel_filter, numpy.abs(transform))
    return spectrogram
|
||||
@@ -101,5 +110,6 @@ def extract_audio_frames(spectrogram : Spectrogram, filter_total : int, audio_fr
|
||||
indices = indices[indices >= audio_frame_step]
|
||||
|
||||
for index in indices:
|
||||
audio_frames.append(spectrogram[:, max(0, index - audio_frame_step) : index])
|
||||
start = max(0, index - audio_frame_step)
|
||||
audio_frames.append(spectrogram[:, start:index])
|
||||
return audio_frames
|
||||
|
||||
@@ -56,8 +56,8 @@ def warp_face_by_face_landmark_5(temp_vision_frame : VisionFrame, face_landmark_
|
||||
|
||||
|
||||
def warp_face_by_bounding_box(temp_vision_frame : VisionFrame, bounding_box : BoundingBox, crop_size : Size) -> Tuple[VisionFrame, Matrix]:
|
||||
source_points = numpy.array([ [ bounding_box[0], bounding_box[1] ], [bounding_box[2], bounding_box[1] ], [ bounding_box[0], bounding_box[3] ] ], dtype = numpy.float32)
|
||||
target_points = numpy.array([ [ 0, 0 ], [ crop_size[0], 0 ], [ 0, crop_size[1] ] ], dtype = numpy.float32)
|
||||
source_points = numpy.array([ [ bounding_box[0], bounding_box[1] ], [bounding_box[2], bounding_box[1] ], [ bounding_box[0], bounding_box[3] ] ]).astype(numpy.float32)
|
||||
target_points = numpy.array([ [ 0, 0 ], [ crop_size[0], 0 ], [ 0, crop_size[1] ] ]).astype(numpy.float32)
|
||||
affine_matrix = cv2.getAffineTransform(source_points, target_points)
|
||||
if bounding_box[2] - bounding_box[0] > crop_size[0] or bounding_box[3] - bounding_box[1] > crop_size[1]:
|
||||
interpolation_method = cv2.INTER_AREA
|
||||
|
||||
@@ -144,10 +144,10 @@ def create_region_mask(crop_vision_frame : VisionFrame, face_mask_regions : List
|
||||
region_mask = (cv2.GaussianBlur(region_mask.clip(0, 1), (0, 0), 5).clip(0.5, 1) - 0.5) * 2
|
||||
return region_mask
|
||||
|
||||
|
||||
1
|
||||
def create_mouth_mask(face_landmark_68 : FaceLandmark68) -> Mask:
|
||||
convex_hull = cv2.convexHull(face_landmark_68[numpy.r_[3:14, 31:36]].astype(numpy.int32))
|
||||
mouth_mask : Mask = numpy.zeros((512, 512), dtype = numpy.float32)
|
||||
mouth_mask : Mask = numpy.zeros((512, 512)).astype(numpy.float32)
|
||||
mouth_mask = cv2.fillConvexPoly(mouth_mask, convex_hull, 1.0)
|
||||
mouth_mask = cv2.erode(mouth_mask.clip(0, 1), numpy.ones((21, 3)))
|
||||
mouth_mask = cv2.GaussianBlur(mouth_mask, (0, 0), sigmaX = 1, sigmaY = 15)
|
||||
|
||||
@@ -219,7 +219,7 @@ def apply_enhance(crop_vision_frame : VisionFrame) -> VisionFrame:
|
||||
if frame_processor_input.name == 'input':
|
||||
frame_processor_inputs[frame_processor_input.name] = crop_vision_frame
|
||||
if frame_processor_input.name == 'weight':
|
||||
weight = numpy.array([ 1 ], dtype = numpy.double)
|
||||
weight = numpy.array([ 1 ]).astype(numpy.double)
|
||||
frame_processor_inputs[frame_processor_input.name] = weight
|
||||
with THREAD_SEMAPHORE:
|
||||
crop_vision_frame = frame_processor.run(None, frame_processor_inputs)[0][0]
|
||||
|
||||
@@ -64,8 +64,9 @@ def create_static_hanning_window(filter_size : int) -> Any:
|
||||
|
||||
def batch_extract_voice(audio : Audio, chunk_size : int, overlap_size : float) -> Audio:
|
||||
step_size = int(chunk_size * (1 - overlap_size))
|
||||
audio_total = numpy.zeros((audio.shape[0], 2), dtype = numpy.float32)
|
||||
audio_count = numpy.zeros((audio.shape[0], 2), dtype = numpy.float32)
|
||||
audio_total = numpy.zeros((audio.shape[0], 2)).astype(numpy.float32)
|
||||
audio_count = numpy.zeros((audio.shape[0], 2)).astype(numpy.float32)
|
||||
|
||||
for start in range(0, audio.shape[0], step_size):
|
||||
end = min(start + chunk_size, audio.shape[0])
|
||||
audio_total[start:end, ...] += extract_voice(audio[start:end, ...])
|
||||
@@ -94,14 +95,19 @@ def extract_voice(audio_chunk : AudioChunk) -> AudioChunk:
|
||||
return audio_chunk
|
||||
|
||||
|
||||
def prepare_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int) -> Tuple[AudioChunk, int]:
    """Scale, pad and window an audio chunk into rows of chunk_size samples.

    The input is transposed first, so its first axis is treated as the
    sample axis. Samples are converted to float32 and divided by the int16
    maximum, then zero padded with trim_size at the front and
    trim_size + pad_size at the back. Windows of chunk_size samples are taken
    every chunk_size - 2 * trim_size samples and stacked row-wise.

    Returns a tuple of the (rows, chunk_size) float32 array and pad_size, the
    number of zero samples appended beyond the trailing trim_size. pad_size
    is never zero: an input that already fills a whole number of steps is
    padded by one full step. That is kept as is, because the returned
    pad_size is part of the contract with the caller.
    """
    audio_chunk = audio_chunk.T
    step_size = chunk_size - 2 * trim_size
    pad_size = step_size - audio_chunk.shape[1] % step_size
    audio_chunk_size = audio_chunk.shape[1] + pad_size
    audio_chunk = audio_chunk.astype(numpy.float32) / numpy.iinfo(numpy.int16).max
    audio_chunk = numpy.pad(audio_chunk, ((0, 0), (trim_size, trim_size + pad_size)), mode = 'constant', constant_values = 0)
    # Window the padded signal exactly once; every slice is a full chunk_size
    # wide because the padding extends the signal by 2 * trim_size + pad_size.
    temp_audio_chunks = [ audio_chunk[:, index:index + chunk_size] for index in range(0, audio_chunk_size, step_size) ]
    audio_chunk = numpy.concatenate(temp_audio_chunks, axis = 0)
    audio_chunk = audio_chunk.reshape((-1, chunk_size))
    return audio_chunk, pad_size
|
||||
|
||||
|
||||
Reference in New Issue
Block a user