mirror of
https://github.com/facefusion/facefusion.git
synced 2026-05-15 11:17:58 +02:00
Add audio encoder (#1096)
* add audio encoder * add todos * add todos * cleannup and add todos * fix lint --------- Co-authored-by: henryruhs <info@henryruhs.com>
This commit is contained in:
@@ -1,8 +1,7 @@
|
||||
import asyncio
|
||||
import ctypes
|
||||
from collections import deque
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Optional, Tuple
|
||||
from typing import Tuple
|
||||
|
||||
import cv2
|
||||
import numpy
|
||||
@@ -11,7 +10,7 @@ from starlette.websockets import WebSocket, WebSocketState
|
||||
from facefusion import rtc_store, session_context, session_manager, state_manager
|
||||
from facefusion.apis.api_helper import get_sec_websocket_protocol
|
||||
from facefusion.apis.session_helper import extract_access_token
|
||||
from facefusion.libraries import opus as opus_module
|
||||
from facefusion.audio_encoder import create_opus_encoder, destroy_opus_encoder, encode_audio_chunk
|
||||
from facefusion.streamer import process_vision_frame
|
||||
from facefusion.types import Resolution, SessionId, VisionFrame
|
||||
from facefusion.video_encoder import create_vpx_encoder, destroy_vpx_encoder, encode_vpx
|
||||
@@ -42,43 +41,6 @@ async def receive_vision_frames(websocket : WebSocket) -> AsyncIterator[VisionFr
|
||||
websocket_event = await websocket.receive()
|
||||
|
||||
|
||||
# TODO: move to facefusion/opus_encoder.py
|
||||
def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_void_p]:
	"""
	Create an Opus encoder through the opus library returned by opus_module.create_static_library.

	:param sample_rate: sample rate forwarded to opus_encoder_create
	:param channels: channel count forwarded to opus_encoder_create
	:return: the value returned by opus_encoder_create, or None when the library is unavailable or the call reports a non-zero error code
	"""
	library = opus_module.create_static_library()

	if not library:
		return None

	# opus_encoder_create reports its status through this out-parameter
	status = ctypes.c_int(0)
	# 2049 is the application argument (presumably OPUS_APPLICATION_AUDIO - confirm against the opus headers)
	handle = library.opus_encoder_create(sample_rate, channels, 2049, ctypes.byref(status))

	if status.value != 0:
		return None

	return handle
|
||||
|
||||
|
||||
# TODO: move to facefusion/opus_encoder.py
|
||||
def encode_opus(opus_encoder : ctypes.c_void_p, pcm_pointer : ctypes.c_void_p, frame_size : int) -> bytes:
	"""
	Encode one frame of float PCM through opus_encode_float.

	:param opus_encoder: encoder handle obtained from create_opus_encoder
	:param pcm_pointer: pointer to the float samples handed to the library
	:param frame_size: frame size forwarded to opus_encode_float
	:return: the encoded packet, or empty bytes when the library is unavailable or the encoder returns a non-positive length
	"""
	library = opus_module.create_static_library()

	if not library:
		return b''

	# the same capacity is used to allocate the buffer and to tell the encoder how much it may write
	capacity = 4000
	packet = ctypes.create_string_buffer(capacity)
	packet_length = library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, packet, capacity)

	if packet_length > 0:
		return packet.raw[:packet_length]

	return b''
|
||||
|
||||
|
||||
# TODO: move to facefusion/opus_encoder.py
|
||||
def destroy_opus_encoder(opus_encoder : ctypes.c_void_p) -> None:
	"""
	Hand the encoder handle to opus_encoder_destroy.

	Does nothing when opus_module.create_static_library yields no library.

	:param opus_encoder: encoder handle obtained from create_opus_encoder
	"""
	library = opus_module.create_static_library()

	if not library:
		return

	library.opus_encoder_destroy(opus_encoder)
|
||||
|
||||
|
||||
# TODO: move to facefusion/vpx_encoder.py, throttle loop to avoid spinning on same frame
|
||||
def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id : SessionId, initial_resolution : Resolution, keyframe_interval : int) -> None:
|
||||
codec_context = create_vpx_encoder(initial_resolution[0], initial_resolution[1], 4500)
|
||||
@@ -117,25 +79,6 @@ def run_video_encode_loop(vision_frame_deque : deque[VisionFrame], session_id :
|
||||
destroy_vpx_encoder(codec_context)
|
||||
|
||||
|
||||
# TODO: move to facefusion/opus_encoder.py
|
||||
def encode_audio_chunk(opus_encoder : ctypes.c_void_p, session_id : SessionId, pcm_data : numpy.ndarray, audio_remainder : numpy.ndarray, audio_pts : int) -> Tuple[numpy.ndarray, int]:
	"""
	Encode as many whole Opus frames as the buffered PCM allows and forward them to the RTC store.

	The leftover from the previous call is prepended to the new samples. The
	buffer is consumed in slices of 1920 values, each encoded with a frame
	size of 960 (presumably 20 ms of interleaved stereo at 48 kHz - confirm
	against the caller).

	:param opus_encoder: encoder handle obtained from create_opus_encoder
	:param session_id: session the encoded packets are sent to
	:param pcm_data: newly received PCM samples
	:param audio_remainder: samples left over from the previous call
	:param audio_pts: presentation timestamp of the next frame
	:return: the samples that did not fill a whole frame and the advanced timestamp
	"""
	pcm_buffer = numpy.concatenate([audio_remainder, pcm_data])
	frame_size = 960
	frame_samples = 1920

	while len(pcm_buffer) >= frame_samples:
		# the encoder reads raw memory as float32, so guarantee dtype and layout before taking the pointer (no copy when the slice already matches)
		chunk = numpy.ascontiguousarray(pcm_buffer[:frame_samples], dtype = numpy.float32)
		pcm_buffer = pcm_buffer[frame_samples:]
		pcm_pointer = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
		audio_buffer = encode_opus(opus_encoder, pcm_pointer, frame_size)

		if audio_buffer:
			rtc_store.send_rtc_audio(session_id, audio_buffer, audio_pts)

		# the timestamp advances for every consumed frame, even when nothing was sent
		audio_pts += frame_size

	return pcm_buffer, audio_pts
|
||||
|
||||
|
||||
# TODO: extract shared session setup from handle_image_stream and handle_video_stream, guard session_id like handle_video_stream
|
||||
async def handle_image_stream(websocket : WebSocket) -> None:
|
||||
subprotocol = get_sec_websocket_protocol(websocket.scope)
|
||||
@@ -183,7 +126,7 @@ async def handle_video_stream(websocket : WebSocket) -> None:
|
||||
keyframe_interval = int(state_manager.get_item('output_video_fps') or 30) # TODO: remove hardcoded via stream_video_fps
|
||||
vision_frame_deque : deque[VisionFrame] = deque(maxlen = 1)
|
||||
opus_encoder = create_opus_encoder(48000, 2) # TODO: guard against opus_encoder being None
|
||||
audio_remainder = numpy.array([], dtype = numpy.float32)
|
||||
audio_initial = numpy.array([], dtype = numpy.float32)
|
||||
audio_pts = 0
|
||||
|
||||
vision_frame_deque.append(first_vision_frame)
|
||||
@@ -202,7 +145,7 @@ async def handle_video_stream(websocket : WebSocket) -> None:
|
||||
|
||||
if frame_type == 2:
|
||||
pcm_data = numpy.frombuffer(frame_buffer, dtype = numpy.float32)
|
||||
audio_remainder, audio_pts = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_remainder, audio_pts)
|
||||
audio_initial, audio_pts = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_initial, audio_pts)
|
||||
|
||||
vision_frame_deque.clear()
|
||||
await video_encode_task
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
import ctypes
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import numpy
|
||||
|
||||
from facefusion import rtc_store
|
||||
from facefusion.libraries import opus as opus_module
|
||||
from facefusion.types import AudioChunk, SessionId
|
||||
|
||||
|
||||
# TODO this method needs refinement
|
||||
def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_void_p]:
	"""
	Create an Opus encoder using the library provided by opus_module.create_static_library.

	:param sample_rate: sample rate passed to opus_encoder_create
	:param channels: channel count passed to opus_encoder_create
	:return: the encoder returned by opus_encoder_create when its error code is zero, otherwise None (also None when no library is available)
	"""
	opus_library = opus_module.create_static_library()

	if not opus_library:
		return None

	# out-parameter that receives the status code of the create call
	error_code = ctypes.c_int(0)
	# 2049 selects the application mode (presumably OPUS_APPLICATION_AUDIO - confirm against the opus headers)
	opus_encoder = opus_library.opus_encoder_create(sample_rate, channels, 2049, ctypes.byref(error_code))

	return opus_encoder if error_code.value == 0 else None
|
||||
|
||||
|
||||
# TODO this method needs refinement - rename to encode_opus_buffer
|
||||
def encode_opus(opus_encoder : ctypes.c_void_p, pcm_pointer : ctypes.c_void_p, frame_size : int) -> bytes:
	"""
	Run opus_encode_float on one frame of PCM samples.

	:param opus_encoder: encoder handle obtained from create_opus_encoder
	:param pcm_pointer: pointer to the float samples to encode
	:param frame_size: frame size passed to opus_encode_float
	:return: the encoded bytes, or empty bytes when no library is available or the encoder returns a length that is not positive
	"""
	opus_library = opus_module.create_static_library()
	encoded_bytes = b''

	if not opus_library:
		return encoded_bytes

	# one constant drives both the allocation and the size limit given to the encoder
	output_size = 4000
	output_buffer = ctypes.create_string_buffer(output_size)
	output_length = opus_library.opus_encode_float(opus_encoder, pcm_pointer, frame_size, output_buffer, output_size)

	if output_length > 0:
		encoded_bytes = output_buffer.raw[:output_length]

	return encoded_bytes
|
||||
|
||||
|
||||
# TODO: not 100% sure this makes full sense - should we clear the lru_cache instead?
|
||||
def destroy_opus_encoder(opus_encoder : ctypes.c_void_p) -> None:
	"""
	Pass the encoder handle to opus_encoder_destroy.

	The call is skipped when opus_module.create_static_library returns no library.

	:param opus_encoder: encoder handle obtained from create_opus_encoder
	"""
	opus_library = opus_module.create_static_library()

	if not opus_library:
		return

	opus_library.opus_encoder_destroy(opus_encoder)
|
||||
|
||||
|
||||
# TODO this method needs refinement - eventual inline to encode
|
||||
def encode_audio_chunk(opus_encoder : ctypes.c_void_p, session_id : SessionId, pcm_data : AudioChunk, audio_remainder : AudioChunk, audio_timestamp : int) -> Tuple[AudioChunk, int]:
	"""
	Encode every complete Opus frame contained in the buffered PCM and send it to the RTC store.

	The remainder of the previous call is joined with the new samples. The
	buffer is consumed in slices of 1920 values, each encoded with a frame
	size of 960 (presumably 20 ms of interleaved stereo at 48 kHz - confirm
	against the caller).

	:param opus_encoder: encoder handle obtained from create_opus_encoder
	:param session_id: session the encoded packets are sent to
	:param pcm_data: newly received PCM samples
	:param audio_remainder: samples left over from the previous call
	:param audio_timestamp: timestamp of the next frame
	:return: the samples that did not fill a whole frame and the advanced timestamp
	"""
	pcm_buffer = numpy.concatenate([ audio_remainder, pcm_data ])
	frame_size = 960
	frame_samples = 1920

	while len(pcm_buffer) >= frame_samples:
		# opus_encode_float reads raw float32 memory, so enforce dtype and contiguity before taking the pointer (no copy when the slice already matches)
		chunk = numpy.ascontiguousarray(pcm_buffer[:frame_samples], dtype = numpy.float32)
		pcm_buffer = pcm_buffer[frame_samples:]
		pcm_pointer = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
		audio_buffer = encode_opus(opus_encoder, pcm_pointer, frame_size)

		if audio_buffer:
			rtc_store.send_rtc_audio(session_id, audio_buffer, audio_timestamp)

		# the timestamp moves forward for each consumed frame, whether or not a packet was sent
		audio_timestamp += frame_size

	return pcm_buffer, audio_timestamp
|
||||
@@ -36,7 +36,7 @@ def create_vpx_encoder(width : int, height : int, bitrate : int) -> Optional[cty
|
||||
return None
|
||||
|
||||
|
||||
# TODO this method needs refinement
|
||||
# TODO this method needs refinement - rename to encode_vpx_buffer
|
||||
def encode_vpx(codec_context : ctypes.Array[ctypes.c_char], yuv_buffer : bytes, width : int, height : int, presentation_timestamp : int, flags : int) -> bytes:
|
||||
vpx_library = vpx_module.create_static_library()
|
||||
frame_buffer = b''
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
import ctypes
|
||||
|
||||
import numpy
|
||||
import pytest
|
||||
from tests.assert_helper import get_test_example_file, get_test_examples_directory
|
||||
|
||||
from facefusion import state_manager
|
||||
from facefusion.audio_encoder import create_opus_encoder, encode_audio_chunk, encode_opus
|
||||
from facefusion.download import conditional_download
|
||||
from facefusion.ffmpeg import read_audio_buffer
|
||||
from facefusion.libraries import opus as opus_module
|
||||
|
||||
|
||||
@pytest.fixture(scope = 'module', autouse = True)
def before_all() -> None:
	"""
	Prepare the module: register the download providers, fetch the example audio file and run the opus library pre-check.
	"""
	example_urls =\
	[
		'https://github.com/facefusion/facefusion-assets/releases/download/examples-3.0.0/source.mp3'
	]

	state_manager.init_item('download_providers', [ 'github', 'huggingface' ])
	conditional_download(get_test_examples_directory(), example_urls)
	opus_module.pre_check()
|
||||
|
||||
|
||||
# TODO: implement
|
||||
def test_create_opus_encoder() -> None:
	"""Placeholder for the create_opus_encoder test, which is not implemented yet."""
	pass
|
||||
|
||||
|
||||
# TODO: rename to test_encode_opus_buffer
|
||||
def test_encode_opus() -> None:
	"""
	Encode the first 1920 samples of the example audio and expect a non-empty packet, then expect empty bytes for a frame size of zero.
	"""
	raw_audio = read_audio_buffer(get_test_example_file('source.mp3'), 48000, 16, 2)
	# scale the 16 bit integer samples into float32
	samples = numpy.frombuffer(raw_audio, dtype = numpy.int16).astype(numpy.float32) / 32768.0
	frame = samples[:1920]
	frame_pointer = frame.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
	encoder = create_opus_encoder(48000, 2)

	assert encode_opus(encoder, frame_pointer, 960)
	assert encode_opus(encoder, frame_pointer, 0) == b''
|
||||
|
||||
|
||||
# TODO: implement
|
||||
def test_destroy_opus_encoder() -> None:
	"""Placeholder for the destroy_opus_encoder test, which is not implemented yet."""
	pass
|
||||
|
||||
|
||||
# TODO: improvise
|
||||
def test_encode_audio_chunk() -> None:
	"""
	Feed encode_audio_chunk with exact, oversized, undersized and remainder-completed inputs and check the leftover length and the returned timestamp.
	"""
	sample_rate = 48000
	channels = 2
	# number of values in one 20 millisecond frame across all channels
	frame_samples = sample_rate * 20 // 1000 * channels
	session_id = 'test-encode-audio-chunk'

	raw_audio = read_audio_buffer(get_test_example_file('source.mp3'), sample_rate, 16, channels)
	samples = numpy.frombuffer(raw_audio, dtype = numpy.int16).astype(numpy.float32) / 32768.0
	encoder = create_opus_encoder(sample_rate, channels)
	audio_initial = numpy.array([], dtype = numpy.float32)

	# each case: new samples, previous remainder, expected leftover length, expected timestamp
	cases =\
	[
		(samples[:frame_samples], audio_initial, 0, 960),
		(samples[:frame_samples + 500], audio_initial, 500, 960),
		(samples[:500], audio_initial, 500, 0),
		(samples[:1000], samples[:920], 0, 960)
	]

	for pcm_data, remainder, expected_length, expected_timestamp in cases:
		audio_remainder, audio_timestamp = encode_audio_chunk(encoder, session_id, pcm_data, remainder, 0)

		assert len(audio_remainder) == expected_length
		assert audio_timestamp == expected_timestamp
|
||||
@@ -17,12 +17,24 @@ def before_all() -> None:
|
||||
vpx_module.pre_check()
|
||||
|
||||
|
||||
# TODO: implement
|
||||
def test_create_vpx_encoder() -> None:
	"""Placeholder for the create_vpx_encoder test, which is not implemented yet."""
	pass
|
||||
|
||||
|
||||
# TODO: rename to test_encode_vpx_buffer
|
||||
def test_encode_vpx() -> None:
	"""
	Encode a valid I420 frame and expect a non-empty result, then expect empty bytes for an empty input buffer.
	"""
	vision_frame = read_video_frame(get_test_example_file('target-240p.mp4'))
	height, width = vision_frame.shape[:2]

	buffer_valid = cv2.cvtColor(vision_frame, cv2.COLOR_BGR2YUV_I420).tobytes()
	buffer_invalid = bytes(0)
	# create the encoder exactly once so no handle is overwritten and left unreleased
	vpx_encoder = create_vpx_encoder(width, height, 1000)

	assert encode_vpx(vpx_encoder, buffer_valid, width, height, 3, 1)
	assert encode_vpx(vpx_encoder, buffer_invalid, width, height, 0, 0) == b''
|
||||
|
||||
|
||||
# TODO: implement
|
||||
def test_destroy_vpx_encoder() -> None:
	"""Placeholder for the destroy_vpx_encoder test, which is not implemented yet."""
	pass
|
||||
|
||||
Reference in New Issue
Block a user