From bff222a12f201c856f88a27c91c245b82e58261d Mon Sep 17 00:00:00 2001 From: Harisreedhar <46858047+harisreedhar@users.noreply.github.com> Date: Wed, 13 May 2026 16:26:30 +0530 Subject: [PATCH] try to unify structure of encode_opus_buffer and encode_vpx_buffer (#1107) --- facefusion/apis/stream_helper.py | 9 +++---- facefusion/audio_encoder.py | 6 ++--- facefusion/video_encoder.py | 46 ++++++++++++++++++-------------- tests/test_video_encoder.py | 16 +++++------ 4 files changed, 41 insertions(+), 36 deletions(-) diff --git a/facefusion/apis/stream_helper.py b/facefusion/apis/stream_helper.py index 68982fb4..bcb16a75 100644 --- a/facefusion/apis/stream_helper.py +++ b/facefusion/apis/stream_helper.py @@ -44,22 +44,21 @@ async def receive_vision_frames(websocket : WebSocket) -> AsyncIterator[VisionFr # TODO: move to facefusion/vpx_encoder.py, throttle loop to avoid spinning on same frame def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id : SessionId, initial_resolution : Resolution, keyframe_interval : int) -> None: - vpx_encoder = create_vpx_encoder(initial_resolution[0], initial_resolution[1], 4500, 8, 16) + vpx_encoder = create_vpx_encoder(initial_resolution, 4500, 8, 16) current_resolution = initial_resolution pts = 0 while vision_frame_deque: vision_frame = vision_frame_deque[-1] output_frame = process_vision_frame(vision_frame) - height, width = output_frame.shape[:2] - frame_resolution = (width, height) + frame_resolution = (output_frame.shape[1], output_frame.shape[0]) if frame_resolution[0] != current_resolution[0] or frame_resolution[1] != current_resolution[1]: if vpx_encoder: destroy_vpx_encoder(vpx_encoder) current_resolution = frame_resolution - vpx_encoder = create_vpx_encoder(current_resolution[0], current_resolution[1], 4500, 8, 16) + vpx_encoder = create_vpx_encoder(current_resolution, 4500, 8, 16) pts = 0 if vpx_encoder: @@ -69,7 +68,7 @@ def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id : if 
pts % keyframe_interval == 0: vpx_flags = 1 - frame_buffer = encode_vpx_buffer(vpx_encoder, yuv_frame.tobytes(), width, height, pts, vpx_flags) + frame_buffer = encode_vpx_buffer(vpx_encoder, yuv_frame.tobytes(), frame_resolution, pts, vpx_flags) if frame_buffer: rtc_store.send_rtc_video(session_id, frame_buffer) diff --git a/facefusion/audio_encoder.py b/facefusion/audio_encoder.py index a69a6988..5429db65 100644 --- a/facefusion/audio_encoder.py +++ b/facefusion/audio_encoder.py @@ -21,10 +21,10 @@ def encode_opus_buffer(opus_encoder : OpusEncoder, pcm_pointer : ctypes.c_void_p if opus_library: output_buffer = ctypes.create_string_buffer(4000) - encoded_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, 4000) + encode_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, 4000) - if encoded_length > 0: - audio_buffer = output_buffer.raw[:encoded_length] + if encode_length > 0: + audio_buffer = output_buffer.raw[:encode_length] return audio_buffer diff --git a/facefusion/video_encoder.py b/facefusion/video_encoder.py index a0f7ca82..e64a37d9 100644 --- a/facefusion/video_encoder.py +++ b/facefusion/video_encoder.py @@ -3,10 +3,10 @@ import struct from typing import Optional from facefusion.libraries import vpx as vpx_module -from facefusion.types import BitRate, VpxEncoder +from facefusion.types import BitRate, Resolution, VpxEncoder -def create_vpx_encoder(width : int, height : int, bitrate : BitRate, thread_count : int, cpu_count : int) -> Optional[VpxEncoder]: +def create_vpx_encoder(frame_resolution : Resolution, bitrate : BitRate, thread_count : int, cpu_count : int) -> Optional[VpxEncoder]: vpx_library = vpx_module.create_static_library() if vpx_library: @@ -17,8 +17,8 @@ def create_vpx_encoder(width : int, height : int, bitrate : BitRate, thread_coun if vpx_library.vpx_codec_enc_config_default(ctypes.byref(vp8_codec), config_buffer, 0) == 0: struct.pack_into('I', 
def collect_vpx_frame_buffer(vpx_encoder : VpxEncoder) -> bytes:
    """
    Drain the packets currently queued on the encoder and return the joined payload of the frame packets.

    :param vpx_encoder: encoder handle the packets are read from
    :return: concatenated frame payload, or empty bytes when the vpx library is unavailable or no frame packet is queued
    """
    vpx_library = vpx_module.create_static_library()
    frame_chunks : list[bytes] = []

    # guard like the sibling encoder functions do, the library handle can be falsy
    if vpx_library:
        iterator = ctypes.c_void_p(0)
        packet = vpx_library.vpx_codec_get_cx_data(vpx_encoder, ctypes.byref(iterator))

        while packet:
            # NOTE(review): kind 0 is presumably VPX_CODEC_CX_FRAME_PKT and the offsets 8 / 16 assume the 64-bit vpx_codec_cx_pkt_t layout - confirm
            if ctypes.c_int.from_address(packet).value == 0:
                buffer_pointer = ctypes.c_void_p.from_address(packet + 8).value
                buffer_size = ctypes.c_size_t.from_address(packet + 16).value
                frame_chunks.append(ctypes.string_at(buffer_pointer, buffer_size))

            packet = vpx_library.vpx_codec_get_cx_data(vpx_encoder, ctypes.byref(iterator))

    # join once instead of growing an immutable bytes object per packet
    return b''.join(frame_chunks)


def encode_vpx_buffer(vpx_encoder : VpxEncoder, raw_frame_buffer : bytes, frame_resolution : Resolution, presentation_timestamp : int, flags : int) -> bytes:
    """
    Encode a single raw frame with the given encoder and return the compressed payload.

    :param vpx_encoder: encoder handle used for encoding
    :param raw_frame_buffer: raw planar frame bytes matching frame_resolution
    :param frame_resolution: frame size, index 0 is used as width and index 1 as height
    :param presentation_timestamp: timestamp handed to vpx_codec_encode
    :param flags: encode flags handed to vpx_codec_encode
    :return: compressed frame payload, or empty bytes when the library is unavailable, the raw buffer is too small for the resolution, or wrapping / encoding fails
    """
    vpx_library = vpx_module.create_static_library()
    frame_buffer = b''
    # NOTE(review): sized for planar 4:2:0 (12 bits per pixel) with both dimensions rounded up to even, which is how libvpx is understood to lay out format 0x102 (I420) - confirm
    aligned_width = (frame_resolution[0] + 1) & ~1
    aligned_height = (frame_resolution[1] + 1) & ~1
    required_size = aligned_width * aligned_height * 3 // 2

    # the ctypes copy is only as large as the input, so an undersized buffer would let the encoder read past its end
    if vpx_library and len(raw_frame_buffer) >= required_size:
        output_buffer = ctypes.create_string_buffer(512)
        encode_string_buffer = ctypes.create_string_buffer(raw_frame_buffer)

        if vpx_library.vpx_img_wrap(output_buffer, 0x102, frame_resolution[0], frame_resolution[1], 1, encode_string_buffer) and vpx_library.vpx_codec_encode(vpx_encoder, output_buffer, presentation_timestamp, 1, flags, 1) == 0:
            frame_buffer = collect_vpx_frame_buffer(vpx_encoder)

    return frame_buffer
1)) == '21c36925' + assert create_hash(encode_vpx_buffer(vpx_encoder, buffer_valid, frame_resolution, 3, 1)) == '21c36925' - assert encode_vpx_buffer(vpx_encoder, buffer_invalid, width, height, 0, 0) == b'' + assert encode_vpx_buffer(vpx_encoder, buffer_invalid, frame_resolution, 0, 0) == b'' def test_destroy_vpx_encoder() -> None: - vpx_encoder = create_vpx_encoder(320, 240, 1000, 8, 16) + vpx_encoder = create_vpx_encoder((320, 240), 1000, 8, 16) with patch.object(vpx_module.create_static_library(), 'vpx_codec_destroy') as mock: destroy_vpx_encoder(vpx_encoder)