try to unify structure of encode_opus_buffer and encode_vpx_buffer (#1107)

2026-07-17 23:47:38 +02:00 · 2026-05-13 16:26:30 +05:30
parent 9453a042a1
commit bff222a12f
4 changed files with 41 additions and 36 deletions
@@ -44,22 +44,21 @@ async def receive_vision_frames(websocket : WebSocket) -> AsyncIterator[VisionFr

 # TODO: move to facefusion/vpx_encoder.py, throttle loop to avoid spinning on same frame
 def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id : SessionId, initial_resolution : Resolution, keyframe_interval : int) -> None:
-	vpx_encoder = create_vpx_encoder(initial_resolution[0], initial_resolution[1], 4500, 8, 16)
+	vpx_encoder = create_vpx_encoder(initial_resolution, 4500, 8, 16)
 	current_resolution = initial_resolution
 	pts = 0

 	while vision_frame_deque:
 		vision_frame = vision_frame_deque[-1]
 		output_frame = process_vision_frame(vision_frame)
-		height, width = output_frame.shape[:2]
-		frame_resolution = (width, height)
+		frame_resolution = (output_frame.shape[1], output_frame.shape[0])

 		if frame_resolution[0] != current_resolution[0] or frame_resolution[1] != current_resolution[1]:
 			if vpx_encoder:
 				destroy_vpx_encoder(vpx_encoder)

 			current_resolution = frame_resolution
-			vpx_encoder = create_vpx_encoder(current_resolution[0], current_resolution[1], 4500, 8, 16)
+			vpx_encoder = create_vpx_encoder(current_resolution, 4500, 8, 16)
 			pts = 0

 		if vpx_encoder:
@@ -69,7 +68,7 @@ def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id :
 			if pts % keyframe_interval == 0:
 				vpx_flags = 1

-			frame_buffer = encode_vpx_buffer(vpx_encoder, yuv_frame.tobytes(), width, height, pts, vpx_flags)
+			frame_buffer = encode_vpx_buffer(vpx_encoder, yuv_frame.tobytes(), frame_resolution, pts, vpx_flags)

 			if frame_buffer:
 				rtc_store.send_rtc_video(session_id, frame_buffer)
@@ -21,10 +21,10 @@ def encode_opus_buffer(opus_encoder : OpusEncoder, pcm_pointer : ctypes.c_void_p

 	if opus_library:
 		output_buffer = ctypes.create_string_buffer(4000)
-		encoded_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, 4000)
+		encode_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, 4000)

-		if encoded_length > 0:
-			audio_buffer = output_buffer.raw[:encoded_length]
+		if encode_length > 0:
+			audio_buffer = output_buffer.raw[:encode_length]

 	return audio_buffer

@@ -3,10 +3,10 @@ import struct
 from typing import Optional

 from facefusion.libraries import vpx as vpx_module
-from facefusion.types import BitRate, VpxEncoder
+from facefusion.types import BitRate, Resolution, VpxEncoder


-def create_vpx_encoder(width : int, height : int, bitrate : BitRate, thread_count : int, cpu_count : int) -> Optional[VpxEncoder]:
+def create_vpx_encoder(frame_resolution : Resolution, bitrate : BitRate, thread_count : int, cpu_count : int) -> Optional[VpxEncoder]:
 	vpx_library = vpx_module.create_static_library()

 	if vpx_library:
@@ -17,8 +17,8 @@ def create_vpx_encoder(width : int, height : int, bitrate : BitRate, thread_coun

 		if vpx_library.vpx_codec_enc_config_default(ctypes.byref(vp8_codec), config_buffer, 0) == 0:
 			struct.pack_into('I', config_buffer, 4, thread_count)
-			struct.pack_into('I', config_buffer, 12, width)
-			struct.pack_into('I', config_buffer, 16, height)
+			struct.pack_into('I', config_buffer, 12, frame_resolution[0])
+			struct.pack_into('I', config_buffer, 16, frame_resolution[1])
 			struct.pack_into('I', config_buffer, 28, 1)
 			struct.pack_into('I', config_buffer, 36, 0)
 			struct.pack_into('I', config_buffer, 72, 0)
@@ -37,27 +37,33 @@ def create_vpx_encoder(width : int, height : int, bitrate : BitRate, thread_coun
 	return None


-# TODO this method needs refinement
-def encode_vpx_buffer(vpx_encoder : VpxEncoder, yuv_buffer : bytes, width : int, height : int, presentation_timestamp : int, flags : int) -> bytes:
+def collect_vpx_frame_buffer(vpx_encoder : VpxEncoder) -> bytes:
+	vpx_library = vpx_module.create_static_library()
+	frame_buffer = b''
+	iterator = ctypes.c_void_p(0)
+	packet = vpx_library.vpx_codec_get_cx_data(vpx_encoder, ctypes.byref(iterator))
+
+	while packet:
+		if ctypes.c_int.from_address(packet).value == 0:
+			buffer_pointer = ctypes.c_void_p.from_address(packet + 8).value
+			buffer_size = ctypes.c_size_t.from_address(packet + 16).value
+			frame_buffer += ctypes.string_at(buffer_pointer, buffer_size)
+
+		packet = vpx_library.vpx_codec_get_cx_data(vpx_encoder, ctypes.byref(iterator))
+
+	return frame_buffer
+
+
+def encode_vpx_buffer(vpx_encoder : VpxEncoder, raw_frame_buffer : bytes, frame_resolution : Resolution, presentation_timestamp : int, flags : int) -> bytes:
 	vpx_library = vpx_module.create_static_library()
 	frame_buffer = b''

 	if vpx_library:
-		image_buffer = ctypes.create_string_buffer(512)
-		yuv_string_buffer = ctypes.create_string_buffer(yuv_buffer)
+		output_buffer = ctypes.create_string_buffer(512)
+		encode_string_buffer = ctypes.create_string_buffer(raw_frame_buffer)

-		if vpx_library.vpx_img_wrap(image_buffer, 0x102, width, height, 1, yuv_string_buffer):
-			if vpx_library.vpx_codec_encode(vpx_encoder, image_buffer, presentation_timestamp, 1, flags, 1) == 0:
-				iterator = ctypes.c_void_p(0)
-				packet = vpx_library.vpx_codec_get_cx_data(vpx_encoder, ctypes.byref(iterator))
-
-				while packet:
-					if ctypes.c_int.from_address(packet).value == 0:
-						buffer_pointer = ctypes.c_void_p.from_address(packet + 8).value
-						buffer_size = ctypes.c_size_t.from_address(packet + 16).value
-						frame_buffer += ctypes.string_at(buffer_pointer, buffer_size)
-
-					packet = vpx_library.vpx_codec_get_cx_data(vpx_encoder, ctypes.byref(iterator))
+		if vpx_library.vpx_img_wrap(output_buffer, 0x102, frame_resolution[0], frame_resolution[1], 1, encode_string_buffer) and vpx_library.vpx_codec_encode(vpx_encoder, output_buffer, presentation_timestamp, 1, flags, 1) == 0:
+			frame_buffer = collect_vpx_frame_buffer(vpx_encoder)

 	return frame_buffer

@@ -23,29 +23,29 @@ def before_all() -> None:


 def test_create_vpx_encoder() -> None:
-	assert create_vpx_encoder(320, 240, 1000, 8, 16)
-	assert create_vpx_encoder(0, 0, 0, 0, 0) is None
+	assert create_vpx_encoder((320, 240), 1000, 8, 16)
+	assert create_vpx_encoder((0, 0), 0, 0, 0) is None


 def test_encode_vpx_buffer() -> None:
 	vision_frame = read_video_frame(get_test_example_file('target-240p.mp4'))
-	height, width = vision_frame.shape[:2]
-	vpx_encoder = create_vpx_encoder(width, height, 1000, 1, 0)
+	frame_resolution = (vision_frame.shape[1], vision_frame.shape[0])
+	vpx_encoder = create_vpx_encoder(frame_resolution, 1000, 1, 0)

 	buffer_valid = cv2.cvtColor(vision_frame, cv2.COLOR_BGR2YUV_I420).tobytes()
 	buffer_invalid = bytes(0)

 	if is_linux() or is_windows():
-		assert create_hash(encode_vpx_buffer(vpx_encoder, buffer_valid, width, height, 3, 1)) == 'ce133a1f'
+		assert create_hash(encode_vpx_buffer(vpx_encoder, buffer_valid, frame_resolution, 3, 1)) == 'ce133a1f'

 	if is_macos():
-		assert create_hash(encode_vpx_buffer(vpx_encoder, buffer_valid, width, height, 3, 1)) == '21c36925'
+		assert create_hash(encode_vpx_buffer(vpx_encoder, buffer_valid, frame_resolution, 3, 1)) == '21c36925'

-	assert encode_vpx_buffer(vpx_encoder, buffer_invalid, width, height, 0, 0) == b''
+	assert encode_vpx_buffer(vpx_encoder, buffer_invalid, frame_resolution, 0, 0) == b''


 def test_destroy_vpx_encoder() -> None:
-	vpx_encoder = create_vpx_encoder(320, 240, 1000, 8, 16)
+	vpx_encoder = create_vpx_encoder((320, 240), 1000, 8, 16)

 	with patch.object(vpx_module.create_static_library(), 'vpx_codec_destroy') as mock:
 		destroy_vpx_encoder(vpx_encoder)