Add audio encoder (#1096)

* add audio encoder

* add todos

* add todos

* cleanup and add todos

* fix lint

---------

Co-authored-by: henryruhs <info@henryruhs.com>
This commit is contained in:
Harisreedhar
2026-05-11 19:19:09 +05:30
committed by henryruhs
parent 9321b41e8e
commit ab7110eb92
5 changed files with 151 additions and 63 deletions
+4 -61
View File
@@ -1,8 +1,7 @@
import asyncio
import ctypes
from collections import deque
from collections.abc import AsyncIterator
from typing import Optional, Tuple
from typing import Tuple
import cv2
import numpy
@@ -11,7 +10,7 @@ from starlette.websockets import WebSocket, WebSocketState
from facefusion import rtc_store, session_context, session_manager, state_manager
from facefusion.apis.api_helper import get_sec_websocket_protocol
from facefusion.apis.session_helper import extract_access_token
from facefusion.libraries import opus as opus_module
from facefusion.audio_encoder import create_opus_encoder, destroy_opus_encoder, encode_audio_chunk
from facefusion.streamer import process_vision_frame
from facefusion.types import Resolution, SessionId, VisionFrame
from facefusion.video_encoder import create_vpx_encoder, destroy_vpx_encoder, encode_vpx
@@ -42,43 +41,6 @@ async def receive_vision_frames(websocket : WebSocket) -> AsyncIterator[VisionFr
websocket_event = await websocket.receive()
# TODO: move to facefusion/opus_encoder.py
def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_void_p]:
	"""
	Create an Opus encoder through the statically loaded opus library.

	Returns the encoder handle when the library is available and reports
	error code 0, otherwise None.
	"""
	opus_library = opus_module.create_static_library()

	if not opus_library:
		return None

	# NOTE(review): 2049 looks like OPUS_APPLICATION_AUDIO from the Opus headers - confirm
	error_code = ctypes.c_int(0)
	opus_encoder = opus_library.opus_encoder_create(sample_rate, channels, 2049, ctypes.byref(error_code))
	return opus_encoder if error_code.value == 0 else None
# TODO: move to facefusion/opus_encoder.py
def encode_opus(opus_encoder : ctypes.c_void_p, pcm_pointer : ctypes.c_void_p, frame_size : int) -> bytes:
	"""
	Encode one frame of float PCM samples into an Opus packet.

	Returns the encoded bytes, or an empty bytes object when the library is
	unavailable or the encoder reports a non-positive length.
	"""
	opus_library = opus_module.create_static_library()

	if not opus_library:
		return b''

	max_packet_size = 4000
	output_buffer = ctypes.create_string_buffer(max_packet_size)
	encoded_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, max_packet_size)

	# only the first encoded_length bytes of the scratch buffer hold packet data
	if encoded_length > 0:
		return output_buffer.raw[:encoded_length]
	return b''
# TODO: move to facefusion/opus_encoder.py
def destroy_opus_encoder(opus_encoder : ctypes.c_void_p) -> None:
	"""
	Hand the encoder back to the opus library for destruction; does nothing when the library is unavailable.
	"""
	opus_library = opus_module.create_static_library()

	if not opus_library:
		return
	opus_library.opus_encoder_destroy(opus_encoder)
# TODO: move to facefusion/vpx_encoder.py, throttle loop to avoid spinning on same frame
def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id : SessionId, initial_resolution : Resolution, keyframe_interval : int) -> None:
codec_context = create_vpx_encoder(initial_resolution[0], initial_resolution[1], 4500)
@@ -117,25 +79,6 @@ def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id :
destroy_vpx_encoder(codec_context)
# TODO: move to facefusion/opus_encoder.py
def encode_audio_chunk(opus_encoder : ctypes.c_void_p, session_id : SessionId, pcm_data : numpy.ndarray, audio_remainder : numpy.ndarray, audio_pts : int) -> Tuple[numpy.ndarray, int]:
	"""
	Encode as many complete frames as possible from the pending audio samples.

	The leftover samples of the previous call (audio_remainder) are prepended
	to pcm_data. Every complete block of 1920 samples is passed to encode_opus
	with a frame size of 960 and each non-empty packet is forwarded through
	rtc_store.send_rtc_audio together with the running timestamp.

	Returns the samples that did not fill a complete block and the timestamp
	advanced by 960 for every consumed block.
	"""
	# presumably 1920 = 960 samples per channel for 2 interleaved channels - confirm against the encoder setup
	frame_size = 960
	frame_samples = 1920
	# the encoder reads the raw memory as C floats, therefore force contiguous float32
	# (numpy.concatenate promotes to float64 as soon as one input is float64)
	pcm_buffer = numpy.ascontiguousarray(numpy.concatenate([ audio_remainder, pcm_data ]), dtype = numpy.float32)

	while len(pcm_buffer) >= frame_samples:
		chunk = pcm_buffer[:frame_samples]
		pcm_buffer = pcm_buffer[frame_samples:]
		pcm_pointer = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
		audio_buffer = encode_opus(opus_encoder, pcm_pointer, frame_size)

		if audio_buffer:
			rtc_store.send_rtc_audio(session_id, audio_buffer, audio_pts)
		audio_pts += frame_size

	return pcm_buffer, audio_pts
# TODO: extract shared session setup from handle_image_stream and handle_video_stream, guard session_id like handle_video_stream
async def handle_image_stream(websocket : WebSocket) -> None:
subprotocol = get_sec_websocket_protocol(websocket.scope)
@@ -183,7 +126,7 @@ async def handle_video_stream(websocket : WebSocket) -> None:
keyframe_interval = int(state_manager.get_item('output_video_fps') or 30) # TODO: remove hardcoded via stream_video_fps
vision_frame_deque : deque[VisionFrame] = deque(maxlen = 1)
opus_encoder = create_opus_encoder(48000, 2) # TODO: guard against opus_encoder being None
audio_remainder = numpy.array([], dtype = numpy.float32)
audio_initial = numpy.array([], dtype = numpy.float32)
audio_pts = 0
vision_frame_deque.append(first_vision_frame)
@@ -202,7 +145,7 @@ async def handle_video_stream(websocket : WebSocket) -> None:
if frame_type == 2:
pcm_data = numpy.frombuffer(frame_buffer, dtype = numpy.float32)
audio_remainder, audio_pts = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_remainder, audio_pts)
audio_initial, audio_pts = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_initial, audio_pts)
vision_frame_deque.clear()
await video_encode_task
+64
View File
@@ -0,0 +1,64 @@
import ctypes
from typing import Optional, Tuple
import numpy
from facefusion import rtc_store
from facefusion.libraries import opus as opus_module
from facefusion.types import AudioChunk, SessionId
# TODO this method needs refinement
def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_void_p]:
	"""
	Create an Opus encoder through the statically loaded opus library.

	Returns the encoder handle when the library is available and reports
	error code 0, otherwise None.
	"""
	opus_library = opus_module.create_static_library()

	if not opus_library:
		return None

	# NOTE(review): 2049 looks like OPUS_APPLICATION_AUDIO from the Opus headers - confirm
	error_code = ctypes.c_int(0)
	opus_encoder = opus_library.opus_encoder_create(sample_rate, channels, 2049, ctypes.byref(error_code))
	return opus_encoder if error_code.value == 0 else None
# TODO this method needs refinement - rename to encode_opus_buffer
def encode_opus(opus_encoder : ctypes.c_void_p, pcm_pointer : ctypes.c_void_p, frame_size : int) -> bytes:
	"""
	Encode one frame of float PCM samples into an Opus packet.

	Returns the encoded bytes, or an empty bytes object when the library is
	unavailable or the encoder reports a non-positive length.
	"""
	opus_library = opus_module.create_static_library()

	if not opus_library:
		return b''

	max_packet_size = 4000
	output_buffer = ctypes.create_string_buffer(max_packet_size)
	encoded_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, max_packet_size)

	# only the first encoded_length bytes of the scratch buffer hold packet data
	if encoded_length > 0:
		return output_buffer.raw[:encoded_length]
	return b''
# TODO: not 100% sure this makes full sense - should we clear the lru-cache instead?
def destroy_opus_encoder(opus_encoder : ctypes.c_void_p) -> None:
	"""
	Hand the encoder back to the opus library for destruction; does nothing when the library is unavailable.
	"""
	opus_library = opus_module.create_static_library()

	if not opus_library:
		return
	opus_library.opus_encoder_destroy(opus_encoder)
# TODO this method needs refinement - eventual inline to encode
def encode_audio_chunk(opus_encoder : ctypes.c_void_p, session_id : SessionId, pcm_data : AudioChunk, audio_remainder : AudioChunk, audio_timestamp : int) -> Tuple[AudioChunk, int]:
	"""
	Encode as many complete frames as possible from the pending audio samples.

	The leftover samples of the previous call (audio_remainder) are prepended
	to pcm_data. Every complete block of 1920 samples is passed to encode_opus
	with a frame size of 960 and each non-empty packet is forwarded through
	rtc_store.send_rtc_audio together with the running timestamp.

	Returns the samples that did not fill a complete block and the timestamp
	advanced by 960 for every consumed block.
	"""
	# presumably 1920 = 960 samples per channel for 2 interleaved channels - confirm against the encoder setup
	frame_size = 960
	frame_samples = 1920
	# the encoder reads the raw memory as C floats, therefore force contiguous float32
	# (numpy.concatenate promotes to float64 as soon as one input is float64)
	pcm_buffer = numpy.ascontiguousarray(numpy.concatenate([ audio_remainder, pcm_data ]), dtype = numpy.float32)

	while len(pcm_buffer) >= frame_samples:
		chunk = pcm_buffer[:frame_samples]
		pcm_buffer = pcm_buffer[frame_samples:]
		pcm_pointer = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
		audio_buffer = encode_opus(opus_encoder, pcm_pointer, frame_size)

		if audio_buffer:
			rtc_store.send_rtc_audio(session_id, audio_buffer, audio_timestamp)
		audio_timestamp += frame_size

	return pcm_buffer, audio_timestamp
+1 -1
View File
@@ -36,7 +36,7 @@ def create_vpx_encoder(width : int, height : int, bitrate : int) -> Optional[cty
return None
# TODO this method needs refinement
# TODO this method needs refinement - rename to encode_vpx_buffer
def encode_vpx(codec_context : ctypes.Array[ctypes.c_char], yuv_buffer : bytes, width : int, height : int, presentation_timestamp : int, flags : int) -> bytes:
vpx_library = vpx_module.create_static_library()
frame_buffer = b''
+69
View File
@@ -0,0 +1,69 @@
import ctypes
import numpy
import pytest
from tests.assert_helper import get_test_example_file, get_test_examples_directory
from facefusion import state_manager
from facefusion.audio_encoder import create_opus_encoder, encode_audio_chunk, encode_opus
from facefusion.download import conditional_download
from facefusion.ffmpeg import read_audio_buffer
from facefusion.libraries import opus as opus_module
@pytest.fixture(scope = 'module', autouse = True)
def before_all() -> None:
	"""
	Module-wide setup: register the download providers, fetch the audio example and run the opus library pre-check.
	"""
	download_providers = [ 'github', 'huggingface' ]
	example_urls =\
	[
		'https://github.com/facefusion/facefusion-assets/releases/download/examples-3.0.0/source.mp3'
	]

	state_manager.init_item('download_providers', download_providers)
	conditional_download(get_test_examples_directory(), example_urls)
	opus_module.pre_check()
def test_create_opus_encoder() -> None:
	"""
	A supported configuration yields an encoder, an unsupported sample rate yields None.
	"""
	assert create_opus_encoder(48000, 2)
	# Opus only accepts 8000, 12000, 16000, 24000 or 48000 Hz, so creation reports an error here
	assert create_opus_encoder(44100, 2) is None
# TODO: rename to test_encode_opus_buffer
def test_encode_opus() -> None:
	"""
	A frame size of 960 produces a packet, a frame size of 0 produces empty bytes.
	"""
	source_path = get_test_example_file('source.mp3')
	audio_buffer = read_audio_buffer(source_path, 48000, 16, 2)
	# scale the signed 16 bit samples into float32
	pcm_samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).astype(numpy.float32) / 32768.0
	pcm_frame = pcm_samples[:1920]
	pcm_pointer = pcm_frame.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
	opus_encoder = create_opus_encoder(48000, 2)

	assert encode_opus(opus_encoder, pcm_pointer, 960)
	assert encode_opus(opus_encoder, pcm_pointer, 0) == b''
# TODO: implement
def test_destroy_opus_encoder() -> None:
	"""
	Placeholder without assertions until the test is implemented.
	"""
# TODO: improve
def test_encode_audio_chunk() -> None:
	"""
	Check the remainder length and the timestamp for exact, oversized, undersized and remainder-completed input.
	"""
	sample_rate = 48000
	channels = 2
	frame_samples = sample_rate * 20 // 1000 * channels
	session_id = 'test-encode-audio-chunk'
	audio_buffer = read_audio_buffer(get_test_example_file('source.mp3'), sample_rate, 16, channels)
	pcm_samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).astype(numpy.float32) / 32768.0
	opus_encoder = create_opus_encoder(sample_rate, channels)
	audio_initial = numpy.array([], dtype = numpy.float32)

	# (pcm data, previous remainder, expected remainder length, expected timestamp)
	test_cases =\
	[
		(pcm_samples[:frame_samples], audio_initial, 0, 960),
		(pcm_samples[:frame_samples + 500], audio_initial, 500, 960),
		(pcm_samples[:500], audio_initial, 500, 0),
		(pcm_samples[:1000], pcm_samples[:920], 0, 960)
	]

	for pcm_data, audio_previous, remainder_length, timestamp in test_cases:
		audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_previous, 0)

		assert len(audio_remainder) == remainder_length
		assert audio_timestamp == timestamp
+13 -1
View File
@@ -17,12 +17,24 @@ def before_all() -> None:
vpx_module.pre_check()
# TODO: implement
def test_create_vpx_encoder() -> None:
	"""
	Placeholder without assertions until the test is implemented.
	"""
# TODO: rename to test_encode_vpx_buffer
def test_encode_vpx() -> None:
	"""
	A valid I420 buffer produces encoded bytes, an empty buffer produces empty bytes.
	"""
	vision_frame = read_video_frame(get_test_example_file('target-240p.mp4'))
	height, width = vision_frame.shape[:2]
	buffer_valid = cv2.cvtColor(vision_frame, cv2.COLOR_BGR2YUV_I420).tobytes()
	buffer_invalid = bytes(0)
	# create the encoder exactly once, a second creation would overwrite the first context without releasing it
	vpx_encoder = create_vpx_encoder(width, height, 1000)

	assert encode_vpx(vpx_encoder, buffer_valid, width, height, 3, 1)
	assert encode_vpx(vpx_encoder, buffer_invalid, width, height, 0, 0) == b''
# TODO: implement
def test_destroy_vpx_encoder() -> None:
	"""
	Placeholder without assertions until the test is implemented.
	"""