diff --git a/facefusion/apis/stream_helper.py b/facefusion/apis/stream_helper.py index 29dab13a..788a0e91 100644 --- a/facefusion/apis/stream_helper.py +++ b/facefusion/apis/stream_helper.py @@ -1,4 +1,5 @@ import asyncio +import ctypes from collections import deque from collections.abc import AsyncIterator from typing import Tuple @@ -10,7 +11,7 @@ from starlette.websockets import WebSocket, WebSocketState from facefusion import rtc_store, session_context, session_manager, state_manager from facefusion.apis.api_helper import get_sec_websocket_protocol from facefusion.apis.session_helper import extract_access_token -from facefusion.audio_encoder import create_opus_encoder, destroy_opus_encoder, encode_audio_chunk +from facefusion.audio_encoder import create_opus_encoder, destroy_opus_encoder, encode_opus from facefusion.streamer import process_vision_frame from facefusion.types import Resolution, SessionId, VisionFrame from facefusion.video_encoder import create_vpx_encoder, destroy_vpx_encoder, encode_vpx @@ -126,8 +127,8 @@ async def handle_video_stream(websocket : WebSocket) -> None: keyframe_interval = int(state_manager.get_item('output_video_fps') or 30) # TODO: remove hardcoded via stream_video_fps vision_frame_deque : deque[VisionFrame] = deque(maxlen = 1) opus_encoder = create_opus_encoder(48000, 2) # TODO: guard against opus_encoder being None - audio_initial = numpy.array([], dtype = numpy.float32) - audio_pts = 0 + audio_temp = numpy.array([], dtype = numpy.float32) + audio_timestamp = 0 vision_frame_deque.append(first_vision_frame) rtc_store.create_rtc_stream(session_id) @@ -144,8 +145,18 @@ async def handle_video_stream(websocket : WebSocket) -> None: vision_frame_deque.append(vision_frame) if frame_type == 2: - pcm_data = numpy.frombuffer(frame_buffer, dtype = numpy.float32) - audio_initial, audio_pts = encode_audio_chunk(opus_encoder, session_id, pcm_data, audio_initial, audio_pts) + audio_temp = numpy.concatenate([ audio_temp, 
numpy.frombuffer(frame_buffer, dtype = numpy.float32) ]) + + while len(audio_temp) >= 1920: + audio_chunk = audio_temp[:1920] + audio_temp = audio_temp[1920:] + pcm_pointer = audio_chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + audio_buffer = encode_opus(opus_encoder, pcm_pointer, 960) + + if audio_buffer: + rtc_store.send_rtc_audio(session_id, audio_buffer, audio_timestamp) + + audio_timestamp += 960 vision_frame_deque.clear() await video_encode_task diff --git a/facefusion/audio_encoder.py b/facefusion/audio_encoder.py index 5452aebf..ee2cd7bc 100644 --- a/facefusion/audio_encoder.py +++ b/facefusion/audio_encoder.py @@ -1,14 +1,9 @@ import ctypes -from typing import Optional, Tuple +from typing import Optional -import numpy - -from facefusion import rtc_store from facefusion.libraries import opus as opus_module -from facefusion.types import AudioChunk, SessionId -# TODO this method needs refinement def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_void_p]: opus_library = opus_module.create_static_library() @@ -22,7 +17,6 @@ def create_opus_encoder(sample_rate : int, channels : int) -> Optional[ctypes.c_ return None -# TODO this method needs refinement - rename to encode_opus_buffer def encode_opus(opus_encoder : ctypes.c_void_p, pcm_pointer : ctypes.c_void_p, frame_size : int) -> bytes: opus_library = opus_module.create_static_library() audio_buffer = b'' @@ -43,22 +37,3 @@ def destroy_opus_encoder(opus_encoder : ctypes.c_void_p) -> None: if opus_library: opus_library.opus_encoder_destroy(opus_encoder) - - -# TODO this method needs refinement - eventual inline to encode -def encode_audio_chunk(opus_encoder : ctypes.c_void_p, session_id : SessionId, pcm_data : AudioChunk, audio_remainder : AudioChunk, audio_timestamp : int) -> Tuple[AudioChunk, int]: - pcm_buffer = numpy.concatenate([ audio_remainder, pcm_data ]) - frame_samples = 1920 - - while len(pcm_buffer) >= frame_samples: - chunk = pcm_buffer[:frame_samples] - 
pcm_buffer = pcm_buffer[frame_samples:] - pcm_pointer = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) - audio_buffer = encode_opus(opus_encoder, pcm_pointer, 960) - - if audio_buffer: - rtc_store.send_rtc_audio(session_id, audio_buffer, audio_timestamp) - - audio_timestamp += 960 - - return pcm_buffer, audio_timestamp diff --git a/tests/test_audio_encoder.py b/tests/test_audio_encoder.py index 6b4014c3..1da683de 100644 --- a/tests/test_audio_encoder.py +++ b/tests/test_audio_encoder.py @@ -5,7 +5,7 @@ import pytest from tests.assert_helper import get_test_example_file, get_test_examples_directory from facefusion import state_manager -from facefusion.audio_encoder import create_opus_encoder, encode_audio_chunk, encode_opus +from facefusion.audio_encoder import create_opus_encoder, encode_opus from facefusion.download import conditional_download from facefusion.ffmpeg import read_audio_buffer from facefusion.libraries import opus as opus_module @@ -25,7 +25,6 @@ def test_create_opus_encoder() -> None: pass -#TODO: rename to test_encode_opus_buffer def test_encode_opus() -> None: audio_buffer = read_audio_buffer(get_test_example_file('source.mp3'), 48000, 16, 2) pcm_samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).astype(numpy.float32) / 32768.0 @@ -39,31 +38,3 @@ def test_encode_opus() -> None: # TODO: implement def test_destroy_opus_encoder() -> None: pass - - -# TODO: improvise -def test_encode_audio_chunk() -> None: - sample_rate = 48000 - channels = 2 - frame_samples = sample_rate * 20 // 1000 * channels - - audio_buffer = read_audio_buffer(get_test_example_file('source.mp3'), sample_rate, 16, channels) - pcm_samples = numpy.frombuffer(audio_buffer, dtype = numpy.int16).astype(numpy.float32) / 32768.0 - opus_encoder = create_opus_encoder(sample_rate, channels) - audio_initial = numpy.array([], dtype = numpy.float32) - - audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:frame_samples], 
audio_initial, 0) - assert len(audio_remainder) == 0 - assert audio_timestamp == 960 - - audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:frame_samples + 500], audio_initial, 0) - assert len(audio_remainder) == 500 - assert audio_timestamp == 960 - - audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:500], audio_initial, 0) - assert len(audio_remainder) == 500 - assert audio_timestamp == 0 - - audio_remainder, audio_timestamp = encode_audio_chunk(opus_encoder, 'test-encode-audio-chunk', pcm_samples[:1000], pcm_samples[:920], 0) - assert len(audio_remainder) == 0 - assert audio_timestamp == 960