Some refactoring for audio.py and some astype() here and there

This commit is contained in:
henryruhs
2024-04-02 15:21:23 +02:00
parent 7b9ccdcf3a
commit 66afcfd83d
5 changed files with 58 additions and 42 deletions
+43 -33
View File
@@ -9,6 +9,41 @@ from facefusion.typing import Fps, Audio, AudioFrame, Spectrogram, MelFilter
from facefusion.voice_extractor import batch_extract_voice
@lru_cache(maxsize = 128)
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """
    Memoized front end for read_audio.

    Results are held in an LRU cache of at most 128 entries keyed on
    (audio_path, fps); a repeated call with the same arguments returns the
    cached object instead of decoding the file again.

    :param audio_path: path of the audio file to read
    :param fps: frame rate forwarded to read_audio
    :return: whatever read_audio returns for these arguments
    """
    audio_frames = read_audio(audio_path, fps)
    return audio_frames
def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """
    Decode an audio file into a list of spectrogram frames.

    The raw buffer is interpreted as int16 samples arranged in two columns,
    normalized, filtered, converted into a spectrogram and finally sliced
    into frames according to fps.

    :param audio_path: path of the audio file to read
    :param fps: frame rate forwarded to extract_audio_frames
    :return: the extracted frames, or None when is_audio rejects the path
    """
    if not is_audio(audio_path):
        return None

    # NOTE(review): 16000 and 2 are presumably sample rate and channel count - confirm against read_audio_buffer
    audio_buffer = read_audio_buffer(audio_path, 16000, 2)
    samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
    samples = filter_audio(normalize_audio(samples), -0.97)
    # positional arguments: sample_rate, filter_total, filter_size, frequency_minimum, frequency_maximum
    spectrogram = create_spectrogram(samples, 16000, 80, 800, 55.0, 7600.0)
    return extract_audio_frames(spectrogram, 80, 16, fps)
@lru_cache(maxsize = 128)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """
    Memoized front end for read_voice.

    Results are held in an LRU cache of at most 128 entries keyed on
    (audio_path, fps); a repeated call with the same arguments returns the
    cached object instead of running the voice extraction again.

    :param audio_path: path of the audio file to read
    :param fps: frame rate forwarded to read_voice
    :return: whatever read_voice returns for these arguments
    """
    voice_frames = read_voice(audio_path, fps)
    return voice_frames
def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """
    Decode an audio file into spectrogram frames after voice extraction.

    Same pipeline as read_audio, except that the int16 samples are passed
    through batch_extract_voice before they are normalized and filtered.

    :param audio_path: path of the audio file to read
    :param fps: frame rate forwarded to extract_audio_frames
    :return: the extracted frames, or None when is_audio rejects the path
    """
    if not is_audio(audio_path):
        return None

    # NOTE(review): 16000 and 2 are presumably sample rate and channel count - confirm against read_audio_buffer
    audio_buffer = read_audio_buffer(audio_path, 16000, 2)
    samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
    # positional arguments: chunk_size of 1000000 and overlap_size of 0.75
    voice = batch_extract_voice(samples, 1000000, 0.75)
    voice = filter_audio(normalize_audio(voice), -0.97)
    # positional arguments: sample_rate, filter_total, filter_size, frequency_minimum, frequency_maximum
    spectrogram = create_spectrogram(voice, 16000, 80, 800, 55.0, 7600.0)
    return extract_audio_frames(spectrogram, 80, 16, fps)
def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
if is_audio(audio_path):
audio_frames = read_static_audio(audio_path, fps)
@@ -26,37 +61,10 @@ def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Opti
def create_empty_audio_frame() -> AudioFrame:
    """
    Build a silent placeholder frame.

    :return: a new (80, 16) array of zeros with dtype int16
    """
    # allocate as int16 directly; zeros(...).astype(...) would first build a float64 array and then copy it
    audio_frame = numpy.zeros((80, 16), dtype = numpy.int16)
    return audio_frame
@lru_cache(maxsize = 128)
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """
    Memoized front end for read_audio.

    The decoding pipeline lives in read_audio; this wrapper only adds an LRU
    cache keyed on (audio_path, fps). The cache is bounded to 128 entries so
    that processing many different files cannot grow memory without limit.

    :param audio_path: path of the audio file to read
    :param fps: frame rate forwarded to read_audio
    :return: the frames produced by read_audio, or None when the path is not audio
    """
    return read_audio(audio_path, fps)
@lru_cache(maxsize = 128)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
    """
    Memoized front end for read_voice.

    The voice extraction pipeline lives in read_voice; this wrapper only adds
    an LRU cache keyed on (audio_path, fps). The cache is bounded to 128
    entries so that processing many different files cannot grow memory
    without limit.

    :param audio_path: path of the audio file to read
    :param fps: frame rate forwarded to read_voice
    :return: the frames produced by read_voice, or None when the path is not audio
    """
    return read_voice(audio_path, fps)
def normalize_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
if audio.ndim > 1:
audio = numpy.mean(audio, axis = 1)
@@ -77,19 +85,20 @@ def convert_mel_to_hertz(mel : numpy.ndarray[Any, Any]) -> numpy.ndarray[Any, An
return 700 * (10 ** (mel / 2595) - 1)
def create_mel_filter(sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> MelFilter:
    """
    Build a bank of triangular filters spaced evenly on the mel scale.

    :param sample_rate: sample rate used to map frequencies to bin indices
    :param filter_total: number of filters, one per output row
    :param filter_size: transform size; each row has filter_size // 2 + 1 columns
    :param frequency_minimum: lower edge of the bank in hertz
    :param frequency_maximum: upper edge of the bank in hertz
    :return: array of shape (filter_total, filter_size // 2 + 1)
    """
    mel_filter = numpy.zeros((filter_total, filter_size // 2 + 1))
    mel_bins = numpy.linspace(convert_hertz_to_mel(frequency_minimum), convert_hertz_to_mel(frequency_maximum), filter_total + 2)
    indices = numpy.floor((filter_size + 1) * convert_mel_to_hertz(mel_bins) / sample_rate).astype(numpy.int16)

    for index in range(filter_total):
        start = indices[index]
        end = indices[index + 1]
        # each row receives one triangular window between two neighbouring bin indices
        mel_filter[index, start:end] = scipy.signal.windows.triang(end - start)
    return mel_filter


@lru_cache(maxsize = None)
def create_static_mel_filter(sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> MelFilter:
    """
    Memoized front end for create_mel_filter.

    The cached array is shared between callers and must be treated as read-only.

    :return: the filter bank built by create_mel_filter for these arguments
    """
    return create_mel_filter(sample_rate, filter_total, filter_size, frequency_minimum, frequency_maximum)
def create_spectrogram(audio : Audio, sample_rate : int, filter_total : int, filter_size : int, frequency_minimum : float, frequency_maximum : float) -> Spectrogram:
    """
    Project the short-time Fourier magnitudes of an audio signal onto a mel filter bank.

    :param audio: signal handed to scipy.signal.stft
    :param sample_rate: sample rate forwarded to create_mel_filter
    :param filter_total: number of mel filters, one per output row
    :param filter_size: segment length and FFT size of the transform
    :param frequency_minimum: lower edge of the filter bank in hertz
    :param frequency_maximum: upper edge of the filter bank in hertz
    :return: the filter bank multiplied with the magnitude of the transform
    """
    mel_filter = create_mel_filter(sample_rate, filter_total, filter_size, frequency_minimum, frequency_maximum)
    # stft returns (frequencies, times, transform); only the transform is needed
    # the overlap is fixed at 600 samples, so filter_size has to be larger than 600
    spectrogram = scipy.signal.stft(audio, nperseg = filter_size, noverlap = 600, nfft = filter_size)[2]
    spectrogram = numpy.dot(mel_filter, numpy.abs(spectrogram))
    return spectrogram
@@ -101,5 +110,6 @@ def extract_audio_frames(spectrogram : Spectrogram, filter_total : int, audio_fr
indices = indices[indices >= audio_frame_step]
for index in indices:
audio_frames.append(spectrogram[:, max(0, index - audio_frame_step) : index])
start = max(0, index - audio_frame_step)
audio_frames.append(spectrogram[:, start:index])
return audio_frames
+2 -2
View File
@@ -56,8 +56,8 @@ def warp_face_by_face_landmark_5(temp_vision_frame : VisionFrame, face_landmark_
def warp_face_by_bounding_box(temp_vision_frame : VisionFrame, bounding_box : BoundingBox, crop_size : Size) -> Tuple[VisionFrame, Matrix]:
source_points = numpy.array([ [ bounding_box[0], bounding_box[1] ], [bounding_box[2], bounding_box[1] ], [ bounding_box[0], bounding_box[3] ] ], dtype = numpy.float32)
target_points = numpy.array([ [ 0, 0 ], [ crop_size[0], 0 ], [ 0, crop_size[1] ] ], dtype = numpy.float32)
source_points = numpy.array([ [ bounding_box[0], bounding_box[1] ], [bounding_box[2], bounding_box[1] ], [ bounding_box[0], bounding_box[3] ] ]).astype(numpy.float32)
target_points = numpy.array([ [ 0, 0 ], [ crop_size[0], 0 ], [ 0, crop_size[1] ] ]).astype(numpy.float32)
affine_matrix = cv2.getAffineTransform(source_points, target_points)
if bounding_box[2] - bounding_box[0] > crop_size[0] or bounding_box[3] - bounding_box[1] > crop_size[1]:
interpolation_method = cv2.INTER_AREA
+2 -2
View File
@@ -144,10 +144,10 @@ def create_region_mask(crop_vision_frame : VisionFrame, face_mask_regions : List
region_mask = (cv2.GaussianBlur(region_mask.clip(0, 1), (0, 0), 5).clip(0.5, 1) - 0.5) * 2
return region_mask
1
def create_mouth_mask(face_landmark_68 : FaceLandmark68) -> Mask:
convex_hull = cv2.convexHull(face_landmark_68[numpy.r_[3:14, 31:36]].astype(numpy.int32))
mouth_mask : Mask = numpy.zeros((512, 512), dtype = numpy.float32)
mouth_mask : Mask = numpy.zeros((512, 512)).astype(numpy.float32)
mouth_mask = cv2.fillConvexPoly(mouth_mask, convex_hull, 1.0)
mouth_mask = cv2.erode(mouth_mask.clip(0, 1), numpy.ones((21, 3)))
mouth_mask = cv2.GaussianBlur(mouth_mask, (0, 0), sigmaX = 1, sigmaY = 15)
@@ -219,7 +219,7 @@ def apply_enhance(crop_vision_frame : VisionFrame) -> VisionFrame:
if frame_processor_input.name == 'input':
frame_processor_inputs[frame_processor_input.name] = crop_vision_frame
if frame_processor_input.name == 'weight':
weight = numpy.array([ 1 ], dtype = numpy.double)
weight = numpy.array([ 1 ]).astype(numpy.double)
frame_processor_inputs[frame_processor_input.name] = weight
with THREAD_SEMAPHORE:
crop_vision_frame = frame_processor.run(None, frame_processor_inputs)[0][0]
+10 -4
View File
@@ -64,8 +64,9 @@ def create_static_hanning_window(filter_size : int) -> Any:
def batch_extract_voice(audio : Audio, chunk_size : int, overlap_size : float) -> Audio:
step_size = int(chunk_size * (1 - overlap_size))
audio_total = numpy.zeros((audio.shape[0], 2), dtype = numpy.float32)
audio_count = numpy.zeros((audio.shape[0], 2), dtype = numpy.float32)
audio_total = numpy.zeros((audio.shape[0], 2)).astype(numpy.float32)
audio_count = numpy.zeros((audio.shape[0], 2)).astype(numpy.float32)
for start in range(0, audio.shape[0], step_size):
end = min(start + chunk_size, audio.shape[0])
audio_total[start:end, ...] += extract_voice(audio[start:end, ...])
@@ -94,14 +95,19 @@ def extract_voice(audio_chunk : AudioChunk) -> AudioChunk:
return audio_chunk
def prepare_audio_chunk(audio_chunk : AudioChunk, chunk_size : int, trim_size : int) -> Tuple[AudioChunk, int]:
    """
    Cut an int16 audio chunk into overlapping, zero padded float32 windows.

    The input is transposed, scaled by the int16 maximum, padded with
    trim_size zeros in front and trim_size + pad_size zeros behind, and then
    sliced into windows of chunk_size columns that advance by
    chunk_size - 2 * trim_size columns.

    :param audio_chunk: array whose first axis runs over samples
    :param chunk_size: number of columns per window
    :param trim_size: number of zero columns added on each side
    :return: the windows stacked into rows of chunk_size columns, and the pad_size that was appended
    """
    audio_chunk = audio_chunk.T
    step_size = chunk_size - 2 * trim_size
    # a length that is already a multiple of step_size still receives one full step of padding
    pad_size = step_size - audio_chunk.shape[1] % step_size
    audio_chunk_size = audio_chunk.shape[1] + pad_size
    audio_chunk = audio_chunk.astype(numpy.float32) / numpy.iinfo(numpy.int16).max
    audio_chunk = numpy.pad(audio_chunk, ((0, 0), (trim_size, trim_size + pad_size)), mode = 'constant', constant_values = 0)
    # slice exactly once; slicing the concatenated result again would scramble the windows
    temp_audio_chunks = [ audio_chunk[:, index:index + chunk_size] for index in range(0, audio_chunk_size, step_size) ]
    audio_chunk = numpy.concatenate(temp_audio_chunks, axis = 0)
    audio_chunk = audio_chunk.reshape((-1, chunk_size))
    return audio_chunk, pad_size