From ab7110eb92a71db2e5436f5736bc4bda3e91a010 Mon Sep 17 00:00:00 2001 From: Harisreedhar <46858047+harisreedhar@users.noreply.github.com> Date: Mon, 11 May 2026 19:19:09 +0530 Subject: [PATCH] Add audio encoder (#1096) * add audio encoder * add todos * add todos * cleannup and add todos * fix lint --------- Co-authored-by: henryruhs --- facefusion/apis/stream_helper.py | 65 ++---------------------------- facefusion/audio_encoder.py | 64 +++++++++++++++++++++++++++++ facefusion/video_encoder.py | 2 +- tests/test_audio_encoder.py | 69 ++++++++++++++++++++++++++++++++ tests/test_video_encoder.py | 14 ++++++- 5 files changed, 151 insertions(+), 63 deletions(-) create mode 100644 facefusion/audio_encoder.py create mode 100644 tests/test_audio_encoder.py diff --git a/facefusion/apis/stream_helper.py b/facefusion/apis/stream_helper.py index d3166c68..29dab13a 100644 --- a/facefusion/apis/stream_helper.py +++ b/facefusion/apis/stream_helper.py @@ -1,8 +1,7 @@ import asyncio -import ctypes from collections import deque from collections.abc import AsyncIterator -from typing import Optional, Tuple +from typing import Tuple import cv2 import numpy @@ -11,7 +10,7 @@ from starlette.websockets import WebSocket, WebSocketState from facefusion import rtc_store, session_context, session_manager, state_manager from facefusion.apis.api_helper import get_sec_websocket_protocol from facefusion.apis.session_helper import extract_access_token -from facefusion.libraries import opus as opus_module +from facefusion.audio_encoder import create_opus_encoder, destroy_opus_encoder, encode_audio_chunk from facefusion.streamer import process_vision_frame from facefusion.types import Resolution, SessionId, VisionFrame from facefusion.video_encoder import create_vpx_encoder, destroy_vpx_encoder, encode_vpx @@ -42,43 +41,6 @@ async def receive_vision_frames(websocket : WebSocket) -> AsyncIterator[VisionFr websocket_event = await websocket.receive() -# TODO: move to facefusion/opus_encoder.py -def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_void_p]: - opus_library = opus_module.create_static_library() - - if opus_library: - error = ctypes.c_int(0) - encoder = opus_library.opus_encoder_create(sample_rate, channels, 2049, ctypes.byref(error)) - - if error.value == 0: - return encoder - - return None - - -# TODO: move to facefusion/opus_encoder.py -def encode_opus(opus_encoder : ctypes.c_void_p, pcm_pointer : ctypes.c_void_p, frame_size : int) -> bytes: - opus_library = opus_module.create_static_library() - audio_buffer = b'' - - if opus_library: - output_buffer = ctypes.create_string_buffer(4000) - encoded_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, 4000) - - if encoded_length > 0: - audio_buffer = output_buffer.raw[:encoded_length] - - return audio_buffer - - -# TODO: move to facefusion/opus_encoder.py -def destroy_opus_encoder(opus_encoder : ctypes.c_void_p) -> None: - opus_library = opus_module.create_static_library() - - if opus_library: - opus_library.opus_encoder_destroy(opus_encoder) - - # TODO: move to facefusion/vpx_encoder.py, throttle loop to avoid spinning on same frame def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id : SessionId, initial_resolution : Resolution, keyframe_interval : int) -> None: codec_context = create_vpx_encoder(initial_resolution[0], initial_resolution[1], 4500) @@ -117,25 +79,6 @@ def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id : destroy_vpx_encoder(codec_context) -# TODO: move to facefusion/opus_encoder.py -def encode_audio_chunk(opus_encoder : ctypes.c_void_p, session_id : SessionId, pcm_data : numpy.ndarray, audio_remainder : numpy.ndarray, audio_pts : int) -> Tuple[numpy.ndarray, int]: - pcm_buffer = numpy.concatenate([audio_remainder, pcm_data]) - frame_samples = 1920 - - while len(pcm_buffer) >= frame_samples: - chunk = pcm_buffer[:frame_samples] - pcm_buffer = pcm_buffer[frame_samples:] - pcm_pointer = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) - audio_buffer = encode_opus(opus_encoder, pcm_pointer, 960) - - if audio_buffer: - rtc_store.send_rtc_audio(session_id, audio_buffer, audio_pts) - - audio_pts += 960 - - return pcm_buffer, audio_pts - - # TODO: extract shared session setup from handle_image_stream and handle_video_stream, guard session_id like handle_video_stream async def handle_image_stream(websocket : WebSocket) -> None: subprotocol = get_sec_websocket_protocol(websocket.scope) @@ -183,7 +126,7 @@ async def handle_video_stream(websocket : WebSocket) -> None: keyframe_interval = int(state_manager.get_item('output_video_fps') or 30) # TODO: remove hardcoded via stream_video_fps vision_frame_deque : deque[VisionFrame] = deque(maxlen = 1) opus_encoder = create_opus_encoder(48000, 2) # TODO: guard against opus_encoder being None - audio_remainder = numpy.array([], dtype = numpy.float32) + audio_initial = numpy.array([], dtype = numpy.float32) audio_pts = 0 vision_frame_deque.append(first_vision_frame) @@ -202,7 +145,7 @@ async def handle_video_stream(websocket : WebSocket) -> None: if frame_type == 2: pcm_data = numpy.frombuffer(frame_buffer, dtype = numpy.float32) - audio_remainder, audio_pts = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_remainder, audio_pts) + audio_initial, audio_pts = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_initial, audio_pts) vision_frame_deque.clear() await video_encode_task diff --git a/facefusion/audio_encoder.py b/facefusion/audio_encoder.py new file mode 100644 index 00000000..5452aebf --- /dev/null +++ b/facefusion/audio_encoder.py @@ -0,0 +1,64 @@ +import ctypes +from typing import Optional, Tuple + +import numpy + +from facefusion import rtc_store +from facefusion.libraries import opus as opus_module +from facefusion.types import AudioChunk, SessionId + + +# TODO this method needs refinement +def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_void_p]: + opus_library = opus_module.create_static_library() + + if opus_library: + error = ctypes.c_int(0) + encoder = opus_library.opus_encoder_create(sample_rate, channels, 2049, ctypes.byref(error)) + + if error.value == 0: + return encoder + + return None + + +# TODO this method needs refinement - rename to encode_opus_buffer +def encode_opus(opus_encoder : ctypes.c_void_p, pcm_pointer : ctypes.c_void_p, frame_size : int) -> bytes: + opus_library = opus_module.create_static_library() + audio_buffer = b'' + + if opus_library: + output_buffer = ctypes.create_string_buffer(4000) + encoded_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, 4000) + + if encoded_length > 0: + audio_buffer = output_buffer.raw[:encoded_length] + + return audio_buffer + + +# TODO not 100 sure this makes full sense. should we not run clear on the lru-cache instead? +def destroy_opus_encoder(opus_encoder : ctypes.c_void_p) -> None: + opus_library = opus_module.create_static_library() + + if opus_library: + opus_library.opus_encoder_destroy(opus_encoder) + + +# TODO this method needs refinement - eventual inline to encode +def encode_audio_chunk(opus_encoder : ctypes.c_void_p, session_id : SessionId, pcm_data : AudioChunk, audio_remainder : AudioChunk, audio_timestamp : int) -> Tuple[AudioChunk, int]: + pcm_buffer = numpy.concatenate([ audio_remainder, pcm_data ]) + frame_samples = 1920 + + while len(pcm_buffer) >= frame_samples: + chunk = pcm_buffer[:frame_samples] + pcm_buffer = pcm_buffer[frame_samples:] + pcm_pointer = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + audio_buffer = encode_opus(opus_encoder, pcm_pointer, 960) + + if audio_buffer: + rtc_store.send_rtc_audio(session_id, audio_buffer, audio_timestamp) + + audio_timestamp += 960 + + return pcm_buffer, audio_timestamp diff --git a/facefusion/video_encoder.py b/facefusion/video_encoder.py index 8ed32736..812bdb3f 100644 --- a/facefusion/video_encoder.py +++ b/facefusion/video_encoder.py @@ -36,7 +36,7 @@ def create_vpx_encoder(width : int, height : int, bitrate : int) -> Optional[cty return None -# TODO this method needs refinement +# TODO this method needs refinement - rename to encode_vpx_buffer def encode_vpx(codec_context : ctypes.Array[ctypes.c_char], yuv_buffer : bytes, width : int, height : int, presentation_timestamp : int, flags : int) -> bytes: vpx_library = vpx_module.create_static_library() frame_buffer = b'' diff --git a/tests/test_audio_encoder.py b/tests/test_audio_encoder.py new file mode 100644 index 00000000..6b4014c3 --- /dev/null +++ b/tests/test_audio_encoder.py @@ -0,0 +1,69 @@ +import ctypes + +import numpy +import pytest +from tests.assert_helper import get_test_example_file, get_test_examples_directory + +from facefusion import state_manager +from facefusion.audio_encoder import create_opus_encoder, encode_audio_chunk, encode_opus +from facefusion.download import conditional_download +from facefusion.ffmpeg import read_audio_buffer +from facefusion.libraries import opus as opus_module + + +@pytest.fixture(scope = 'module', autouse = True) +def before_all() -> None: + state_manager.init_item('download_providers', [ 'github', 'huggingface' ]) + + conditional_download(get_test_examples_directory(), [ 'https://github.com/facefusion/facefusion-assets/releases/download/examples-3.0.0/source.mp3' ]) + + opus_module.pre_check() + + +# TODO: implement +def test_create_opus_encoder() -> None: + pass + + +#TODO: rename to test_encode_opus_buffer +def test_encode_opus() -> None: + audio_buffer = read_audio_buffer(get_test_example_file('source.mp3'), 48000, 16, 2) + pcm_samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).astype(numpy.float32) / 32768.0 + pcm_pointer = pcm_samples[:1920].ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + opus_encoder = create_opus_encoder(48000, 2) + + assert encode_opus(opus_encoder, pcm_pointer, 960) + assert encode_opus(opus_encoder, pcm_pointer, 0) == b'' + + +# TODO: implement +def test_destroy_opus_encoder() -> None: + pass + + +# TODO: improvise +def test_encode_audio_chunk() -> None: + sample_rate = 48000 + channels = 2 + frame_samples = sample_rate * 20 // 1000 * channels + + audio_buffer = read_audio_buffer(get_test_example_file('source.mp3'), sample_rate, 16, channels) + pcm_samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).astype(numpy.float32) / 32768.0 + opus_encoder = create_opus_encoder(sample_rate, channels) + audio_initial = numpy.array([], dtype = numpy.float32) + + audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:frame_samples], audio_initial, 0) + assert len(audio_remainder) == 0 + assert audio_timestamp == 960 + + audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:frame_samples + 500], audio_initial, 0) + assert len(audio_remainder) == 500 + assert audio_timestamp == 960 + + audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:500], audio_initial, 0) + assert len(audio_remainder) == 500 + assert audio_timestamp == 0 + + audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:1000], pcm_samples[:920], 0) + assert len(audio_remainder) == 0 + assert audio_timestamp == 960 diff --git a/tests/test_video_encoder.py b/tests/test_video_encoder.py index 1c8d30e3..264d7690 100644 --- a/tests/test_video_encoder.py +++ b/tests/test_video_encoder.py @@ -17,12 +17,24 @@ def before_all() -> None: vpx_module.pre_check() +# TODO: implement +def test_create_vpx_encoder() -> None: + pass + + +# TODO: rename to test_encode_vpx_buffer def test_encode_vpx() -> None: vision_frame = read_video_frame(get_test_example_file('target-240p.mp4')) height, width = vision_frame.shape[:2] + vpx_encoder = create_vpx_encoder(width, height, 1000) + buffer_valid = cv2.cvtColor(vision_frame, cv2.COLOR_BGR2YUV_I420).tobytes() buffer_invalid = bytes(0) - vpx_encoder = create_vpx_encoder(width, height, 1000) assert encode_vpx(vpx_encoder, buffer_valid, width, height, 3, 1) assert encode_vpx(vpx_encoder, buffer_invalid, width, height, 0, 0) == b'' + + +# TODO: implement +def test_destroy_vpx_encoder() -> None: + pass