move run_video_encode_loop (#1100)

This commit is contained in:
Harisreedhar
2026-05-11 21:04:56 +05:30
committed by GitHub
parent 6968d8fe47
commit 92296fc5a5
3 changed files with 18 additions and 61 deletions
+16 -5
View File
@@ -1,4 +1,5 @@
import asyncio
import ctypes
from collections import deque
from collections.abc import AsyncIterator
from typing import Tuple
@@ -10,7 +11,7 @@ from starlette.websockets import WebSocket, WebSocketState
from facefusion import rtc_store, session_context, session_manager, state_manager
from facefusion.apis.api_helper import get_sec_websocket_protocol
from facefusion.apis.session_helper import extract_access_token
from facefusion.audio_encoder import create_opus_encoder, destroy_opus_encoder, encode_audio_chunk
from facefusion.audio_encoder import create_opus_encoder, destroy_opus_encoder, encode_opus
from facefusion.streamer import process_vision_frame
from facefusion.types import Resolution, SessionId, VisionFrame
from facefusion.video_encoder import create_vpx_encoder, destroy_vpx_encoder, encode_vpx
@@ -126,8 +127,8 @@ async def handle_video_stream(websocket : WebSocket) -> None:
keyframe_interval = int(state_manager.get_item('output_video_fps') or 30) # TODO: remove hardcoded via stream_video_fps
vision_frame_deque : deque[VisionFrame] = deque(maxlen = 1)
opus_encoder = create_opus_encoder(48000, 2) # TODO: guard against opus_encoder being None
audio_initial = numpy.array([], dtype = numpy.float32)
audio_pts = 0
audio_temp = numpy.array([], dtype = numpy.float32)
audio_timestamp = 0
vision_frame_deque.append(first_vision_frame)
rtc_store.create_rtc_stream(session_id)
@@ -144,8 +145,18 @@ async def handle_video_stream(websocket : WebSocket) -> None:
vision_frame_deque.append(vision_frame)
if frame_type == 2:
pcm_data = numpy.frombuffer(frame_buffer, dtype = numpy.float32)
audio_initial, audio_pts = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_initial, audio_pts)
# Decode the frame explicitly as float32: numpy.frombuffer defaults to float64, which would misread the samples that are later handed to the encoder through a c_float pointer.
audio_temp = numpy.concatenate([ audio_temp, numpy.frombuffer(frame_buffer, dtype = numpy.float32) ])
while len(audio_temp) >= 1920:
audio_chunk = audio_temp[:1920]
audio_temp = audio_temp[1920:]
pcm_pointer = audio_chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
audio_buffer = encode_opus(opus_encoder, pcm_pointer, 960)
if audio_buffer:
rtc_store.send_rtc_audio(session_id, audio_buffer, audio_timestamp)
audio_timestamp += 960
vision_frame_deque.clear()
await video_encode_task
+1 -26
View File
@@ -1,14 +1,9 @@
import ctypes
from typing import Optional, Tuple
from typing import Optional
import numpy
from facefusion import rtc_store
from facefusion.libraries import opus as opus_module
from facefusion.types import AudioChunk, SessionId
# TODO this method needs refinement
def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_void_p]:
opus_library = opus_module.create_static_library()
@@ -22,7 +17,6 @@ def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_
return None
# TODO this method needs refinement - rename to encode_opus_buffer
def encode_opus(opus_encoder : ctypes.c_void_p, pcm_pointer : ctypes.c_void_p, frame_size : int) -> bytes:
opus_library = opus_module.create_static_library()
audio_buffer = b''
@@ -43,22 +37,3 @@ def destroy_opus_encoder(opus_encoder : ctypes.c_void_p) -> None:
if opus_library:
opus_library.opus_encoder_destroy(opus_encoder)
# TODO this method needs refinement - eventual inline to encode
# Append pcm_data to audio_remainder, encode every complete 1920-value chunk and send the result to the session.
# opus_encoder: encoder handle forwarded unchanged to encode_opus.
# session_id: target passed to rtc_store.send_rtc_audio for each non-empty packet.
# pcm_data: newly received samples, concatenated after audio_remainder.
# audio_remainder: samples carried over from a previous call.
# audio_timestamp: timestamp sent with each packet and advanced in steps of 960.
# Returns the values that did not fill a whole chunk together with the updated timestamp.
def encode_audio_chunk(opus_encoder : ctypes.c_void_p, session_id : SessionId, pcm_data : AudioChunk, audio_remainder : AudioChunk, audio_timestamp : int) -> Tuple[AudioChunk, int]:
pcm_buffer = numpy.concatenate([ audio_remainder, pcm_data ])
# 1920 values are consumed per encode call while a frame size of 960 is passed on - presumably 960 samples x 2 interleaved channels, TODO confirm
frame_samples = 1920
while len(pcm_buffer) >= frame_samples:
chunk = pcm_buffer[:frame_samples]
pcm_buffer = pcm_buffer[frame_samples:]
# hand the chunk to the encoder as a raw float pointer
pcm_pointer = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
audio_buffer = encode_opus(opus_encoder, pcm_pointer, 960)
# an empty result from encode_opus is not forwarded
if audio_buffer:
rtc_store.send_rtc_audio(session_id, audio_buffer, audio_timestamp)
audio_timestamp += 960
return pcm_buffer, audio_timestamp
+1 -30
View File
@@ -5,7 +5,7 @@ import pytest
from tests.assert_helper import get_test_example_file, get_test_examples_directory
from facefusion import state_manager
from facefusion.audio_encoder import create_opus_encoder, encode_audio_chunk, encode_opus
from facefusion.audio_encoder import create_opus_encoder, encode_opus
from facefusion.download import conditional_download
from facefusion.ffmpeg import read_audio_buffer
from facefusion.libraries import opus as opus_module
@@ -25,7 +25,6 @@ def test_create_opus_encoder() -> None:
pass
#TODO: rename to test_encode_opus_buffer
def test_encode_opus() -> None:
audio_buffer = read_audio_buffer(get_test_example_file('source.mp3'), 48000, 16, 2)
pcm_samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).astype(numpy.float32) / 32768.0
@@ -39,31 +38,3 @@ def test_encode_opus() -> None:
# TODO: implement
def test_destroy_opus_encoder() -> None:
pass
# TODO: improvise
# Checks the remainder length and timestamp returned by encode_audio_chunk for inputs around the chunk boundary.
def test_encode_audio_chunk() -> None:
sample_rate = 48000
channels = 2
# 48000 * 20 // 1000 * 2 = 1920 values, the chunk size encode_audio_chunk consumes per encode call
frame_samples = sample_rate * 20 // 1000 * channels
audio_buffer = read_audio_buffer(get_test_example_file('source.mp3'), sample_rate, 16, channels)
# convert the int16 samples to float32 scaled by 1 / 32768
pcm_samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).astype(numpy.float32) / 32768.0
opus_encoder = create_opus_encoder(sample_rate, channels)
audio_initial = numpy.array([], dtype = numpy.float32)
# exactly one chunk: nothing left over, timestamp advanced once
audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:frame_samples], audio_initial, 0)
assert len(audio_remainder) == 0
assert audio_timestamp == 960
# one chunk plus 500 values: the 500 extra values are returned as remainder
audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:frame_samples + 500], audio_initial, 0)
assert len(audio_remainder) == 500
assert audio_timestamp == 960
# less than one chunk: everything is kept as remainder and the timestamp is unchanged
audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:500], audio_initial, 0)
assert len(audio_remainder) == 500
assert audio_timestamp == 0
# a 920-value remainder plus 1000 new values adds up to exactly one chunk
audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:1000], pcm_samples[:920], 0)
assert len(audio_remainder) == 0
assert audio_timestamp == 960