diff --git a/facefusion/audio.py b/facefusion/audio.py index 7b4904b3..f941d2c2 100644 --- a/facefusion/audio.py +++ b/facefusion/audio.py @@ -9,6 +9,41 @@ from facefusion.typing import Fps, Audio, AudioFrame, Spectrogram, MelFilter from facefusion.voice_extractor import batch_extract_voice +@lru_cache(maxsize = 128) +def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]: + return read_audio(audio_path, fps) + + +def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]: + if is_audio(audio_path): + audio_buffer = read_audio_buffer(audio_path, 16000, 2) + audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2) + audio = normalize_audio(audio) + audio = filter_audio(audio, -0.97) + spectrogram = create_spectrogram(audio, 16000, 80, 800, 55.0, 7600.0) + audio_frames = extract_audio_frames(spectrogram, 80, 16, fps) + return audio_frames + return None + + +@lru_cache(maxsize = 128) +def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]: + return read_voice(audio_path, fps) + + +def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]: + if is_audio(audio_path): + audio_buffer = read_audio_buffer(audio_path, 16000, 2) + audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2) + audio = batch_extract_voice(audio, 1000000, 0.75) + audio = normalize_audio(audio) + audio = filter_audio(audio, -0.97) + spectrogram = create_spectrogram(audio, 16000, 80, 800, 55.0, 7600.0) + audio_frames = extract_audio_frames(spectrogram, 80, 16, fps) + return audio_frames + return None + + def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]: if is_audio(audio_path): audio_frames = read_static_audio(audio_path, fps) @@ -26,37 +61,10 @@ def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Opti def create_empty_audio_frame() -> AudioFrame: - audio_frame = numpy.zeros((80, 16), dtype = numpy.int16) + audio_frame = numpy.zeros((80, 16)).astype(numpy.int16) return audio_frame -@lru_cache(maxsize = None) -def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]: - if is_audio(audio_path): - audio_buffer = read_audio_buffer(audio_path, 16000, 2) - audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2) - audio = normalize_audio(audio) - audio = filter_audio(audio, -0.97) - spectrogram = create_spectrogram(audio, 16000, 80, 800, 55.0, 7600.0) - audio_frames = extract_audio_frames(spectrogram, 80, 16, fps) - return audio_frames - return None - - -@lru_cache(maxsize = None) -def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]: - if is_audio(audio_path): - audio_buffer = read_audio_buffer(audio_path, 16000, 2) - audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2) - audio = batch_extract_voice(audio, 1000000, 0.75) - audio = normalize_audio(audio) - audio = filter_audio(audio, -0.97) - spectrogram = create_spectrogram(audio, 16000, 80, 800, 55.0, 7600.0) - audio_frames = extract_audio_frames(spectrogram, 80, 16, fps) - return audio_frames - return None - - def normalize_audio(audio : numpy.ndarray[Any, Any]) -> Audio: if audio.ndim > 1: audio = numpy.mean(audio, axis = 1) @@ -77,19 +85,20 @@ def convert_mel_to_hertz(mel : numpy.ndarray[Any, Any]) -> numpy.ndarray[Any, An return 700 * (10 ** (mel / 2595) - 1) -@lru_cache(maxsize = None) -def create_static_mel_filter(sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> MelFilter: +def create_mel_filter(sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> MelFilter: mel_filter = numpy.zeros((filter_total, filter_size // 2 + 1)) mel_bins = numpy.linspace(convert_hertz_to_mel(frequency_minimum), convert_hertz_to_mel(frequency_maximum), filter_total + 2) indices = numpy.floor((filter_size + 1) * convert_mel_to_hertz(mel_bins) / sample_rate).astype(numpy.int16) for index in range(filter_total): - mel_filter[index, indices[index]: indices[index + 1]] = scipy.signal.windows.triang(indices[index + 1] - indices[index]) + start = indices[index] + end = indices[index + 1] + mel_filter[index, start:end] = scipy.signal.windows.triang(end - start) return mel_filter def create_spectrogram(audio : Audio, sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> Spectrogram: - mel_filter = create_static_mel_filter(sample_rate, filter_total, filter_size, frequency_minimum, frequency_maximum) + mel_filter = create_mel_filter(sample_rate, filter_total, filter_size, frequency_minimum, frequency_maximum) spectrogram = scipy.signal.stft(audio, nperseg = filter_size, noverlap = 600, nfft = filter_size)[2] spectrogram = numpy.dot(mel_filter, numpy.abs(spectrogram)) return spectrogram @@ -101,5 +110,6 @@ def extract_audio_frames(spectrogram : Spectrogram, filter_total : int, audio_fr indices = indices[indices >= audio_frame_step] for index in indices: - audio_frames.append(spectrogram[:, max(0, index - audio_frame_step) : index]) + start = max(0, index - audio_frame_step) + audio_frames.append(spectrogram[:, start:index]) return audio_frames diff --git a/facefusion/face_helper.py b/facefusion/face_helper.py index d0697db5..83eff56e 100644 --- a/facefusion/face_helper.py +++ b/facefusion/face_helper.py @@ -56,8 +56,8 @@ def warp_face_by_face_landmark_5(temp_vision_frame : VisionFrame, face_landmark_ def warp_face_by_bounding_box(temp_vision_frame : VisionFrame, bounding_box : BoundingBox, crop_size : Size) -> Tuple[VisionFrame, Matrix]: - source_points = numpy.array([ [ bounding_box[0], bounding_box[1] ], [bounding_box[2], bounding_box[1] ], [ bounding_box[0], bounding_box[3] ] ], dtype = numpy.float32) - target_points = numpy.array([ [ 0, 0 ], [ crop_size[0], 0 ], [ 0, crop_size[1] ] ], dtype = numpy.float32) + source_points = numpy.array([ [ bounding_box[0], bounding_box[1] ], [bounding_box[2], bounding_box[1] ], [ bounding_box[0], bounding_box[3] ] ]).astype(numpy.float32) + target_points = numpy.array([ [ 0, 0 ], [ crop_size[0], 0 ], [ 0, crop_size[1] ] ]).astype(numpy.float32) affine_matrix = cv2.getAffineTransform(source_points, target_points) if bounding_box[2] - bounding_box[0] > crop_size[0] or bounding_box[3] - bounding_box[1] > crop_size[1]: interpolation_method = cv2.INTER_AREA diff --git a/facefusion/face_masker.py b/facefusion/face_masker.py index 3c68a4c0..b6796c82 100755 --- a/facefusion/face_masker.py +++ b/facefusion/face_masker.py @@ -144,10 +144,10 @@ def create_region_mask(crop_vision_frame : VisionFrame, face_mask_regions : List region_mask = (cv2.GaussianBlur(region_mask.clip(0, 1), (0, 0), 5).clip(0.5, 1) - 0.5) * 2 return region_mask - +1 def create_mouth_mask(face_landmark_68 : FaceLandmark68) -> Mask: convex_hull = cv2.convexHull(face_landmark_68[numpy.r_[3:14, 31:36]].astype(numpy.int32)) - mouth_mask : Mask = numpy.zeros((512, 512), dtype = numpy.float32) + mouth_mask : Mask = numpy.zeros((512, 512)).astype(numpy.float32) mouth_mask = cv2.fillConvexPoly(mouth_mask, convex_hull, 1.0) mouth_mask = cv2.erode(mouth_mask.clip(0, 1), numpy.ones((21, 3))) mouth_mask = cv2.GaussianBlur(mouth_mask, (0, 0), sigmaX = 1, sigmaY = 15) diff --git a/facefusion/processors/frame/modules/face_enhancer.py b/facefusion/processors/frame/modules/face_enhancer.py index 998ecf09..ca40ab70 100755 --- a/facefusion/processors/frame/modules/face_enhancer.py +++ b/facefusion/processors/frame/modules/face_enhancer.py @@ -219,7 +219,7 @@ def apply_enhance(crop_vision_frame : VisionFrame) -> VisionFrame: if frame_processor_input.name == 'input': frame_processor_inputs[frame_processor_input.name] = crop_vision_frame if frame_processor_input.name == 'weight': - weight = numpy.array([ 1 ], dtype = numpy.double) + weight = numpy.array([ 1 ]).astype(numpy.double) frame_processor_inputs[frame_processor_input.name] = weight with THREAD_SEMAPHORE: crop_vision_frame = frame_processor.run(None, frame_processor_inputs)[0][0] diff --git a/facefusion/voice_extractor.py b/facefusion/voice_extractor.py index 4932aef8..c914e01c 100644 --- a/facefusion/voice_extractor.py +++ b/facefusion/voice_extractor.py @@ -64,8 +64,9 @@ def create_static_hanning_window(filter_size : int) -> Any: def batch_extract_voice(audio : Audio, chunk_size : int, overlap_size : float) -> Audio: step_size = int(chunk_size * (1 - overlap_size)) - audio_total = numpy.zeros((audio.shape[0], 2), dtype = numpy.float32) - audio_count = numpy.zeros((audio.shape[0], 2), dtype = numpy.float32) + audio_total = numpy.zeros((audio.shape[0], 2)).astype(numpy.float32) + audio_count = numpy.zeros((audio.shape[0], 2)).astype(numpy.float32) + for start in range(0, audio.shape[0], step_size): end = min(start + chunk_size, audio.shape[0]) audio_total[start:end, ...] += extract_voice(audio[start:end, ...]) @@ -94,14 +95,19 @@ def extract_voice(audio_chunk : AudioChunk) -> AudioChunk: return audio_chunk -def prepare_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int) -> Tuple[AudioChunk, int]: +def prepare_audio_chunk(audio_chunk: AudioChunk, chunk_size: int, trim_size: int) -> Tuple[AudioChunk, int]: audio_chunk = audio_chunk.T step_size = chunk_size - 2 * trim_size pad_size = step_size - audio_chunk.shape[1] % step_size audio_chunk_size = audio_chunk.shape[1] + pad_size audio_chunk = audio_chunk.astype(numpy.float32) / numpy.iinfo(numpy.int16).max audio_chunk = numpy.pad(audio_chunk, ((0, 0), (trim_size, trim_size + pad_size)), mode = 'constant', constant_values = 0) - audio_chunk = numpy.concatenate([ audio_chunk[:,i:i + chunk_size ] for i in range(0, audio_chunk_size, step_size)], axis = 0) + temp_audio_chunks = [] + + for index in range(0, audio_chunk_size, step_size): + chunk = audio_chunk[:, index:index + chunk_size] + temp_audio_chunks.append(chunk) + audio_chunk = numpy.concatenate(temp_audio_chunks, axis = 0) audio_chunk = audio_chunk.reshape((-1, chunk_size)) return audio_chunk, pad_size