Feat/remb both direction (#1145)

* revamp remb to take both directions

* extract more methods
This commit is contained in:
Henry Ruhs
2026-06-03 10:31:33 +02:00
committed by GitHub
parent d9553b12e8
commit 2ac9b70550
8 changed files with 169 additions and 128 deletions
+10 -9
View File
@@ -23,7 +23,8 @@ def run_audio_encode_loop(rtc_peer : RtcPeer, audio_deque : deque[AudioPack], au
output_audio_buffer = opus_encoder.encode(audio_encoder, temp_audio_frame.tobytes(), 960)
if output_audio_buffer:
rtc.send_audio(rtc_peer, output_audio_buffer, int(temp_audio_time * 48000))
audio_timestamp = int(temp_audio_time * 48000)
rtc.send_audio(rtc_peer, output_audio_buffer, audio_timestamp)
if len(audio_deque) == 0:
audio_event.wait()
@@ -34,14 +35,6 @@ def run_audio_encode_loop(rtc_peer : RtcPeer, audio_deque : deque[AudioPack], au
opus_encoder.destroy(audio_encoder)
def fill_audio_deque(audio_codec : AudioCodec, audio_decoder : OpusDecoder, audio_buffer : bytes, audio_deque : deque[AudioPack], audio_event : threading.Event) -> None:
audio_frame = decode_audio_frame(audio_codec, audio_decoder, audio_buffer)
if audio_frame:
audio_deque.append((numpy.frombuffer(audio_frame, dtype = numpy.float32), time.monotonic()))
audio_event.set()
def receive_audio_frames(rtc_peer_audio : RtcPeerAudio, audio_deque : deque[AudioPack], audio_event : threading.Event) -> None:
audio_track = rtc_peer_audio.get('receiver_track')
audio_codec = rtc_peer_audio.get('codec')
@@ -69,6 +62,14 @@ def receive_audio_frames(rtc_peer_audio : RtcPeerAudio, audio_deque : deque[Audi
destroy_audio_decoder(audio_codec, audio_decoder)
def fill_audio_deque(audio_codec : AudioCodec, audio_decoder : OpusDecoder, audio_buffer : bytes, audio_deque : deque[AudioPack], audio_event : threading.Event) -> None:
audio_frame = decode_audio_frame(audio_codec, audio_decoder, audio_buffer)
if audio_frame:
audio_deque.append((numpy.frombuffer(audio_frame, dtype = numpy.float32), time.monotonic()))
audio_event.set()
def decode_audio_frame(audio_codec : AudioCodec, audio_decoder : OpusDecoder, input_buffer : bytes) -> Optional[bytes]:
if audio_codec == 'opus':
return opus_decoder.decode(audio_decoder, input_buffer, 960, 2)
+15 -16
View File
@@ -28,6 +28,19 @@ async def process_image(websocket : WebSocket) -> None:
await websocket.send_bytes(output_frame_buffer.tobytes())
async def receive_vision_frames(websocket : WebSocket) -> AsyncIterator[VisionFrame]:
websocket_event = await websocket.receive()
while websocket_event.get('type') == 'websocket.receive':
frame_buffer = websocket_event.get('bytes') or bytes()
vision_frame = cv2.imdecode(numpy.frombuffer(frame_buffer, numpy.uint8), cv2.IMREAD_COLOR)
if numpy.any(vision_frame):
yield vision_frame
websocket_event = await websocket.receive()
def process_video(session_id : SessionId, sdp_offer : SdpOffer) -> Optional[SdpAnswer]:
video_codec : VideoCodec = 'vp8'
@@ -42,9 +55,8 @@ def process_video(session_id : SessionId, sdp_offer : SdpOffer) -> Optional[SdpA
video_sender_track = rtc.add_video_track(peer_connection, 'sendonly', video_codec, video_payload_type)
sender_bitrate = ctypes.c_uint(0)
receiver_bitrate = ctypes.c_uint(0)
rtc.wire_remb(video_sender_track, sender_bitrate)
rtc.wire_remb(video_receiver_track, receiver_bitrate)
receiver_bitrate = ctypes.c_uint(8000)
rtc.wire_sender_bitrate(video_sender_track, sender_bitrate)
audio_codec : AudioCodec = 'opus'
audio_payload_type = rtc.get_payload_type(sdp_offer, audio_codec)
@@ -88,19 +100,6 @@ def process_video(session_id : SessionId, sdp_offer : SdpOffer) -> Optional[SdpA
return None
async def receive_vision_frames(websocket : WebSocket) -> AsyncIterator[VisionFrame]:
websocket_event = await websocket.receive()
while websocket_event.get('type') == 'websocket.receive':
frame_buffer = websocket_event.get('bytes') or bytes()
vision_frame = cv2.imdecode(numpy.frombuffer(frame_buffer, numpy.uint8), cv2.IMREAD_COLOR)
if numpy.any(vision_frame):
yield vision_frame
websocket_event = await websocket.receive()
async def run_peer_loop(session_id : SessionId, rtc_peer : RtcPeer) -> None:
video_deque : deque[VideoPack] = deque(maxlen = 1)
audio_deque : deque[AudioPack] = deque(maxlen = 10)
+58 -29
View File
@@ -25,49 +25,32 @@ def run_video_encode_loop(rtc_peer : RtcPeer, video_deque : deque[VideoPack], vi
temp_resolution : Resolution = (temp_vision_frame.shape[1], temp_vision_frame.shape[0])
temp_bitrate : BitRate = 8000
video_encoder = create_video_encoder(video_codec, temp_resolution, temp_bitrate)
previous_video_time = temp_video_time
frame_index = 0
while numpy.any(temp_vision_frame):
output_vision_frame = streamer.process_frame(create_empty_audio_frame(), temp_vision_frame)
output_resolution : Resolution = (output_vision_frame.shape[1], output_vision_frame.shape[0])
output_vision_buffer = cv2.cvtColor(output_vision_frame, cv2.COLOR_BGR2YUV_I420).tobytes()
peer_bitrate = rtc_peer.get('sender_bitrate').value
if output_resolution[0] - temp_resolution[0] or output_resolution[1] - temp_resolution[1]:
destroy_video_encoder(video_codec, video_encoder)
temp_resolution = output_resolution
video_encoder = create_video_encoder(video_codec, temp_resolution, temp_bitrate)
frame_index = 0
if peer_bitrate and peer_bitrate - temp_bitrate:
temp_bitrate = peer_bitrate
if not update_video_encoder_bitrate(video_codec, video_encoder, temp_bitrate):
destroy_video_encoder(video_codec, video_encoder)
video_encoder = create_video_encoder(video_codec, temp_resolution, temp_bitrate)
frame_index = 0
encode_start = time.monotonic()
output_vision_buffer, output_resolution = process_video_frame(temp_vision_frame)
peer_bitrate : BitRate = rtc_peer.get('sender_bitrate').value
video_encoder, temp_resolution, temp_bitrate, frame_index = adapt_video_encoder(video_codec, video_encoder, temp_resolution, temp_bitrate, output_resolution, peer_bitrate, frame_index)
output_video_buffer = encode_video_frame(video_codec, video_encoder, output_vision_buffer, temp_resolution, frame_index)
if output_video_buffer:
rtc.send_video(rtc_peer, output_video_buffer, int(temp_video_time * 90000))
encode_time = time.monotonic() - encode_start
frame_interval = temp_video_time - previous_video_time
previous_video_time = temp_video_time
rtc.adapt_receiver_bitrate(rtc_peer, calculate_receiver_bitrate(rtc_peer, encode_time, frame_interval))
frame_index += 1
video_event.wait()
video_event.clear()
temp_vision_frame, temp_video_time = video_deque.popleft()
destroy_video_encoder(video_codec, video_encoder)
rtc.clear_remb(rtc_peer)
def fill_video_deque(video_codec : VideoCodec, video_decoder : VpxDecoder | AomDecoder, video_buffer : bytes, video_deque : deque[VideoPack], video_event : threading.Event) -> None:
vision_frame = decode_video_frame(video_codec, video_decoder, video_buffer)
if numpy.any(vision_frame):
video_deque.append((vision_frame, time.monotonic()))
video_event.set()
rtc.clear_bitrate(rtc_peer)
def receive_video_frames(rtc_peer_video : RtcPeerVideo, video_deque : deque[VideoPack], video_event : threading.Event) -> None:
@@ -97,6 +80,52 @@ def receive_video_frames(rtc_peer_video : RtcPeerVideo, video_deque : deque[Vide
destroy_video_decoder(video_codec, video_decoder)
def fill_video_deque(video_codec : VideoCodec, video_decoder : VpxDecoder | AomDecoder, video_buffer : bytes, video_deque : deque[VideoPack], video_event : threading.Event) -> None:
vision_frame = decode_video_frame(video_codec, video_decoder, video_buffer)
if numpy.any(vision_frame):
video_deque.append((vision_frame, time.monotonic()))
video_event.set()
def process_video_frame(vision_frame : VisionFrame) -> tuple[bytes, Resolution]:
output_vision_frame = streamer.process_frame(create_empty_audio_frame(), vision_frame)
output_resolution : Resolution = (output_vision_frame.shape[1], output_vision_frame.shape[0])
output_vision_buffer = cv2.cvtColor(output_vision_frame, cv2.COLOR_BGR2YUV_I420).tobytes()
return output_vision_buffer, output_resolution
def adapt_video_encoder(video_codec : VideoCodec, video_encoder : VpxEncoder | AomEncoder, resolution : Resolution, bitrate : BitRate, output_resolution : Resolution, peer_bitrate : BitRate, frame_index : int) -> tuple[VpxEncoder | AomEncoder, Resolution, BitRate, int]:
if output_resolution[0] - resolution[0] or output_resolution[1] - resolution[1]:
destroy_video_encoder(video_codec, video_encoder)
resolution = output_resolution
video_encoder = create_video_encoder(video_codec, resolution, bitrate)
frame_index = 0
if peer_bitrate and peer_bitrate - bitrate:
bitrate = peer_bitrate
if not update_video_encoder_bitrate(video_codec, video_encoder, bitrate):
destroy_video_encoder(video_codec, video_encoder)
video_encoder = create_video_encoder(video_codec, resolution, bitrate)
frame_index = 0
return video_encoder, resolution, bitrate, frame_index
def calculate_receiver_bitrate(rtc_peer : RtcPeer, encode_time : float, frame_interval : float) -> BitRate:
min_bitrate : BitRate = 500
max_bitrate : BitRate = 8000
bitrate : BitRate = rtc_peer.get('receiver_bitrate').value
if frame_interval > 0:
scale = frame_interval / encode_time
bitrate = int(bitrate * scale)
bitrate = max(min_bitrate, min(max_bitrate, bitrate))
return bitrate
def decode_video_frame(video_codec : VideoCodec, video_decoder : VpxDecoder | AomDecoder, input_buffer : bytes) -> Optional[VisionFrame]:
if video_codec == 'av1':
aom_pointer = aom_decoder.decode(video_decoder, input_buffer)
+3
View File
@@ -224,6 +224,9 @@ def init_ctypes(library : ctypes.CDLL) -> ctypes.CDLL:
library.rtcChainRembHandler.argtypes = [ ctypes.c_int, ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_uint, ctypes.c_void_p) ]
library.rtcChainRembHandler.restype = ctypes.c_int
library.rtcRequestBitrate.argtypes = [ ctypes.c_int, ctypes.c_uint ]
library.rtcRequestBitrate.restype = ctypes.c_int
library.rtcSetAvailableCallback.argtypes = [ ctypes.c_int, ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_void_p) ]
library.rtcSetAvailableCallback.restype = ctypes.c_int
+14 -5
View File
@@ -2,7 +2,7 @@ import ctypes
from typing import List, Optional
from facefusion.libraries import datachannel as datachannel_module
from facefusion.types import AudioCodec, MediaDirection, PeerConnection, RtcAudioTrack, RtcPeer, RtcTrackInit, RtcVideoTrack, SdpAnswer, SdpOffer, VideoCodec
from facefusion.types import AudioCodec, BitRate, MediaDirection, PeerConnection, RtcAudioTrack, RtcPeer, RtcTrackInit, RtcVideoTrack, SdpAnswer, SdpOffer, VideoCodec
def create_peer_connection() -> PeerConnection:
@@ -226,15 +226,24 @@ def get_payload_type(sdp_offer : SdpOffer, codec : AudioCodec | VideoCodec) -> i
@ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_uint, ctypes.c_void_p)
def handle_remb(track : int, bitrate : int, pointer : int) -> None:
def handle_sender_bitrate(_ : int, bitrate : BitRate, pointer : int) -> None:
ctypes.cast(pointer, ctypes.POINTER(ctypes.c_uint)).contents.value = bitrate // 1000
def wire_remb(video_track : RtcVideoTrack, bitrate : ctypes.c_uint) -> None:
def wire_sender_bitrate(video_track : RtcVideoTrack, bitrate : ctypes.c_uint) -> None:
datachannel_library = datachannel_module.create_static_library()
datachannel_library.rtcSetUserPointer(video_track, ctypes.cast(ctypes.byref(bitrate), ctypes.c_void_p))
datachannel_library.rtcChainRembHandler(video_track, handle_remb)
datachannel_library.rtcChainRembHandler(video_track, handle_sender_bitrate)
def clear_remb(rtc_peer : RtcPeer) -> None:
def adapt_receiver_bitrate(rtc_peer : RtcPeer, bitrate : BitRate) -> None:
datachannel_library = datachannel_module.create_static_library()
receiver_track = rtc_peer.get('video').get('receiver_track')
rtc_peer.get('receiver_bitrate').value = bitrate
datachannel_library.rtcRequestBitrate(receiver_track, bitrate * 1000)
def clear_bitrate(rtc_peer : RtcPeer) -> None:
rtc_peer.get('sender_bitrate').value = 0
rtc_peer.get('receiver_bitrate').value = 0
+27 -27
View File
@@ -53,6 +53,30 @@ async def test_process_image() -> None:
assert create_hash(websocket_mock.send_bytes.call_args[0][0]) == '0142782f'
@pytest.mark.anyio
async def test_receive_vision_frames() -> None:
image_buffer = open(get_test_example_file('source.jpg'), 'rb').read()
websocket_mock = AsyncMock()
websocket_mock.receive.side_effect =\
[
{
'type': 'websocket.receive',
'bytes': image_buffer
},
{
'type': 'websocket.receive',
'bytes': 'invalid'.encode()
},
{
'type': 'websocket.disconnect'
}
]
vision_frames = receive_vision_frames(websocket_mock)
assert create_hash((await anext(vision_frames)).tobytes()) == '5ed32ca0'
@pytest.mark.parametrize('video_codec, session_id', [ ('av1', 'test-process-video-av1'), ('vp8', 'test-process-video-vp8') ])
def test_process_video(video_codec : VideoCodec, session_id : str) -> None:
peer_connection = rtc.create_peer_connection()
@@ -80,39 +104,15 @@ def test_process_video(video_codec : VideoCodec, session_id : str) -> None:
receiver_bitrate = peer.get('receiver_bitrate')
assert sender_bitrate.value == 0
assert receiver_bitrate.value == 0
assert receiver_bitrate.value == 8000
rtc.handle_remb(0, 8000000, ctypes.addressof(sender_bitrate))
rtc.handle_sender_bitrate(0, 8000000, ctypes.addressof(sender_bitrate))
assert sender_bitrate.value == 8000
rtc.handle_remb(0, 4000000, ctypes.addressof(receiver_bitrate))
rtc.adapt_receiver_bitrate(peer, 4000)
assert receiver_bitrate.value == 4000
@pytest.mark.anyio
async def test_receive_vision_frames() -> None:
image_buffer = open(get_test_example_file('source.jpg'), 'rb').read()
websocket_mock = AsyncMock()
websocket_mock.receive.side_effect =\
[
{
'type': 'websocket.receive',
'bytes': image_buffer
},
{
'type': 'websocket.receive',
'bytes': 'invalid'.encode()
},
{
'type': 'websocket.disconnect'
}
]
vision_frames = receive_vision_frames(websocket_mock)
assert create_hash((await anext(vision_frames)).tobytes()) == '5ed32ca0'
@pytest.mark.parametrize('video_codec, payload_type, session_id', [ ('av1', 35, 'test-run-peer-loop-av1'), ('vp8', 96, 'test-run-peer-loop-vp8') ])
def test_run_peer_loop(video_codec : VideoCodec, payload_type : int, session_id : SessionId) -> None:
peer_connection = rtc.create_peer_connection()
+31 -31
View File
@@ -56,7 +56,7 @@ def test_run_video_encode_loop(video_codec : VideoCodec, payload_type : int) ->
'codec': video_codec
},
'sender_bitrate': ctypes.c_uint(0),
'receiver_bitrate': ctypes.c_uint(0)
'receiver_bitrate': ctypes.c_uint(8000)
}
video_deque : deque[VideoPack] = deque()
@@ -86,6 +86,36 @@ def test_run_video_encode_loop(video_codec : VideoCodec, payload_type : int) ->
pytest.skip()
@pytest.mark.parametrize('video_codec', [ 'av1', 'vp8' ])
def test_receive_video_frames(video_codec : VideoCodec) -> None:
video_frame = read_video_frame(get_test_example_file('target-240p.mp4'))
video_deque : deque[VideoPack] = deque()
video_event = threading.Event()
datachannel_library_mock = MagicMock()
datachannel_library_mock.rtcReceiveMessage.side_effect = [ 0, -1 ]
with patch('facefusion.apis.stream_video.datachannel_module.create_static_library', return_value = datachannel_library_mock):
with patch('facefusion.apis.stream_video.decode_video_frame', return_value = video_frame):
rtc_peer_video : RtcPeerVideo =\
{
'sender_track': 0,
'receiver_track': 0,
'codec': video_codec
}
video_receiver_thread = threading.Thread(target = receive_video_frames, args = (rtc_peer_video, video_deque, video_event), daemon = True)
video_receiver_thread.start()
video_receiver_thread.join(timeout = 5.0)
vision_frame, _ = video_deque.popleft()
if is_linux() or is_windows():
assert create_hash(vision_frame.tobytes()) == 'a17439db'
if is_macos():
assert create_hash(vision_frame.tobytes()) == '38d00e2a'
@pytest.mark.parametrize('video_codec', [ 'av1', 'vp8' ])
def test_fill_video_deque(video_codec : VideoCodec) -> None:
video_frame = read_video_frame(get_test_example_file('target-240p.mp4'))
@@ -117,36 +147,6 @@ def test_fill_video_deque(video_codec : VideoCodec) -> None:
assert create_hash(vision_frame.tobytes()) == 'ff3ecb43'
@pytest.mark.parametrize('video_codec', [ 'av1', 'vp8' ])
def test_receive_video_frames(video_codec : VideoCodec) -> None:
video_frame = read_video_frame(get_test_example_file('target-240p.mp4'))
video_deque : deque[VideoPack] = deque()
video_event = threading.Event()
datachannel_library_mock = MagicMock()
datachannel_library_mock.rtcReceiveMessage.side_effect = [ 0, -1 ]
with patch('facefusion.apis.stream_video.datachannel_module.create_static_library', return_value = datachannel_library_mock):
with patch('facefusion.apis.stream_video.decode_video_frame', return_value = video_frame):
rtc_peer_video : RtcPeerVideo =\
{
'sender_track': 0,
'receiver_track': 0,
'codec': video_codec
}
video_receiver_thread = threading.Thread(target = receive_video_frames, args = (rtc_peer_video, video_deque, video_event), daemon = True)
video_receiver_thread.start()
video_receiver_thread.join(timeout = 5.0)
vision_frame, _ = video_deque.popleft()
if is_linux() or is_windows():
assert create_hash(vision_frame.tobytes()) == 'a17439db'
if is_macos():
assert create_hash(vision_frame.tobytes()) == '38d00e2a'
@pytest.mark.parametrize('video_codec', [ 'av1', 'vp8' ])
def test_encode_and_decode_video_frame(video_codec : VideoCodec) -> None:
video_frame = read_video_frame(get_test_example_file('target-240p.mp4'))
+11 -11
View File
@@ -5,7 +5,7 @@ import pytest
from facefusion import state_manager
from facefusion.libraries import datachannel as datachannel_module, opus as opus_module, vpx as vpx_module
from facefusion.rtc import add_audio_track, add_video_track, create_peer_connection, create_sdp_answer, create_sdp_offer, delete_peers, get_payload_type, handle_remb, send_audio, send_video, set_remote_description, wire_remb
from facefusion.rtc import adapt_receiver_bitrate, add_audio_track, add_video_track, create_peer_connection, create_sdp_answer, create_sdp_offer, delete_peers, get_payload_type, handle_sender_bitrate, send_audio, send_video, set_remote_description, wire_sender_bitrate
from facefusion.types import RtcPeer, VideoCodec
@@ -153,7 +153,7 @@ def test_get_payload_type() -> None:
@pytest.mark.parametrize('video_codec, payload_type', [ ('av1', 35), ('vp8', 96) ])
def test_wire_remb(video_codec : VideoCodec, payload_type : int) -> None:
def test_wire_sender_bitrate(video_codec : VideoCodec, payload_type : int) -> None:
datachannel_library = datachannel_module.create_static_library()
peer_connection = create_peer_connection()
video_sender_track = add_video_track(peer_connection, 'sendonly', video_codec, payload_type)
@@ -170,38 +170,38 @@ def test_wire_remb(video_codec : VideoCodec, payload_type : int) -> None:
'receiver_bitrate': ctypes.c_uint(0)
}
wire_remb(video_sender_track, rtc_peer.get('sender_bitrate'))
wire_sender_bitrate(video_sender_track, rtc_peer.get('sender_bitrate'))
assert rtc_peer.get('sender_bitrate').value == 0
handle_remb(0, 6000000, ctypes.addressof(rtc_peer.get('sender_bitrate')))
handle_sender_bitrate(0, 8000000, ctypes.addressof(rtc_peer.get('sender_bitrate')))
assert rtc_peer.get('sender_bitrate').value == 6000
assert rtc_peer.get('sender_bitrate').value == 8000
datachannel_library.rtcDeletePeerConnection(peer_connection)
@pytest.mark.parametrize('video_codec, payload_type', [ ('av1', 35), ('vp8', 96) ])
def test_wire_remb_receiver(video_codec : VideoCodec, payload_type : int) -> None:
def test_adapt_receiver_bitrate(video_codec : VideoCodec, payload_type : int) -> None:
datachannel_library = datachannel_module.create_static_library()
peer_connection = create_peer_connection()
video_sender_track = add_video_track(peer_connection, 'sendonly', video_codec, payload_type)
video_receiver_track = add_video_track(peer_connection, 'recvonly', video_codec, payload_type)
rtc_peer : RtcPeer =\
{
'peer_connection': peer_connection,
'video':
{
'sender_track': video_receiver_track,
'sender_track': video_sender_track,
'receiver_track': video_receiver_track,
'codec': video_codec
},
'sender_bitrate': ctypes.c_uint(0),
'receiver_bitrate': ctypes.c_uint(0)
'receiver_bitrate': ctypes.c_uint(8000)
}
wire_remb(video_receiver_track, rtc_peer.get('receiver_bitrate'))
handle_remb(0, 6000000, ctypes.addressof(rtc_peer.get('receiver_bitrate')))
adapt_receiver_bitrate(rtc_peer, 4000)
assert rtc_peer.get('receiver_bitrate').value == 6000
assert rtc_peer.get('receiver_bitrate').value == 4000
datachannel_library.rtcDeletePeerConnection(peer_connection)