perf(ui): decouple face detection from swap in live webcam pipeline

Add a dedicated detection thread that runs face detection continuously
on the latest captured frame and publishes results to a shared dict.
The processing/swap thread reads cached detection results instead of
running detection inline, so it never blocks on the 15-30ms detection
cost.

Architecture change: 2 threads → 3 threads
  Before: capture → [detect + swap] → display
  After:  capture → swap (uses cached detections) → display
                  ↘ detect (async, writes to shared cache) ↗

Also replaces the blocking while/ROOT.update() display loop with
ROOT.after()-based scheduling, which avoids Tk event loop re-entrancy
issues and UI freezes.

Closes #1664
This commit is contained in:
Lauri Gates
2026-02-22 18:41:47 +02:00
parent d89385457e
commit ca6cba9311
+86 -41
View File
@@ -997,28 +997,48 @@ def _capture_thread_func(cap, capture_queue, stop_event):
pass
# How often to run full face detection. On intermediate frames the last
# detected face positions are reused, which significantly reduces the
# per-frame cost of the processing thread.
DETECT_EVERY_N = 2
def _detection_thread_func(latest_frame_holder, detection_result, detection_lock, stop_event):
"""Detection thread: continuously runs face detection on the latest
captured frame and stores results in detection_result under detection_lock.
This decouples face detection (~15-30ms) from face swapping (~5-10ms)
so the swap loop never blocks on detection, significantly improving
live mode FPS."""
while not stop_event.is_set():
with detection_lock:
frame = latest_frame_holder[0]
if frame is None:
time.sleep(0.005)
continue
if modules.globals.many_faces:
many = get_many_faces(frame)
with detection_lock:
detection_result['target_face'] = None
detection_result['many_faces'] = many
else:
face = get_one_face(frame)
with detection_lock:
detection_result['target_face'] = face
detection_result['many_faces'] = None
def _processing_thread_func(capture_queue, processed_queue, stop_event):
"""Processing thread: takes raw frames from capture_queue, applies face
processing, and puts results into processed_queue. Drops processed frames
when the output queue is full so the UI always gets the latest result.
def _processing_thread_func(capture_queue, processed_queue, stop_event,
latest_frame_holder, detection_result, detection_lock):
"""Processing thread: takes raw frames from capture_queue, reads the
latest detection result from the shared detection_result dict, applies
face swap/enhancement, and puts results into processed_queue.
Uses DETECT_EVERY_N to skip expensive face detection on intermediate
frames, reusing cached face positions instead."""
Face detection runs concurrently in _detection_thread_func — this thread
only reads cached results so it never blocks on detection."""
frame_processors = get_frame_processors_modules(modules.globals.frame_processors)
source_image = None
last_source_path = None
prev_time = time.time()
fps_update_interval = 0.5
frame_count = 0
fps = 0
proc_frame_index = 0
cached_target_face = None # cached single-face result
cached_many_faces = None # cached many-faces result
while not stop_event.is_set():
try:
@@ -1026,32 +1046,31 @@ def _processing_thread_func(capture_queue, processed_queue, stop_event):
except queue.Empty:
continue
temp_frame = frame.copy()
run_detection = (proc_frame_index % DETECT_EVERY_N == 0)
proc_frame_index += 1
temp_frame = frame
if modules.globals.live_mirror:
temp_frame = gpu_flip(temp_frame, 1)
# Publish the mirrored frame for the detection thread to pick up
with detection_lock:
latest_frame_holder[0] = temp_frame
if not modules.globals.map_faces:
if source_image is None and modules.globals.source_path:
if modules.globals.source_path and modules.globals.source_path != last_source_path:
last_source_path = modules.globals.source_path
source_image = get_one_face(cv2.imread(modules.globals.source_path))
# Update face detection cache on detection frames
if run_detection or (cached_target_face is None and cached_many_faces is None):
if modules.globals.many_faces:
cached_many_faces = get_many_faces(temp_frame)
cached_target_face = None
else:
cached_target_face = get_one_face(temp_frame)
cached_many_faces = None
# Read latest detection results (brief lock to avoid blocking detection thread)
with detection_lock:
cached_target_face = detection_result.get('target_face')
cached_many_faces = detection_result.get('many_faces')
for frame_processor in frame_processors:
if frame_processor.NAME == "DLC.FACE-ENHANCER":
if modules.globals.fp_ui["face_enhancer"]:
temp_frame = frame_processor.process_frame(None, temp_frame)
elif frame_processor.NAME == "DLC.FACE-SWAPPER":
# Use cached face positions to skip redundant detection
# Use cached face positions from detection thread
swapped_bboxes = []
if modules.globals.many_faces and cached_many_faces:
result = temp_frame.copy()
@@ -1127,6 +1146,14 @@ def create_webcam_preview(camera_index: int):
processed_queue = queue.Queue(maxsize=2)
stop_event = threading.Event()
# Shared state for the detection pipeline.
# latest_frame_holder[0] is the most recent raw frame for the detection
# thread; detection_result holds the last detected faces for the
# processing thread to read. Both are guarded by detection_lock.
detection_lock = threading.Lock()
latest_frame_holder = [None]
detection_result = {'target_face': None, 'many_faces': None}
# Start capture thread
cap_thread = threading.Thread(
target=_capture_thread_func,
@@ -1135,21 +1162,45 @@ def create_webcam_preview(camera_index: int):
)
cap_thread.start()
# Start detection thread — runs face detection asynchronously so the
# processing/swap thread never blocks on it
det_thread = threading.Thread(
target=_detection_thread_func,
args=(latest_frame_holder, detection_result, detection_lock, stop_event),
daemon=True,
)
det_thread.start()
# Start processing thread
proc_thread = threading.Thread(
target=_processing_thread_func,
args=(capture_queue, processed_queue, stop_event),
args=(capture_queue, processed_queue, stop_event,
latest_frame_holder, detection_result, detection_lock),
daemon=True,
)
proc_thread.start()
# Main (UI) thread: pull processed frames and update the display
while not stop_event.is_set():
# Cleanup helper called from the display loop when preview closes
def _cleanup():
stop_event.set()
cap_thread.join(timeout=2.0)
det_thread.join(timeout=2.0)
proc_thread.join(timeout=2.0)
cap.release()
PREVIEW.withdraw()
# Non-blocking display loop using ROOT.after() — avoids blocking the
# Tk event loop which could cause UI freezes or re-entrancy issues
def _display_next_frame():
if stop_event.is_set() or PREVIEW.state() == "withdrawn":
_cleanup()
return
try:
temp_frame = processed_queue.get(timeout=0.03)
temp_frame = processed_queue.get_nowait()
except queue.Empty:
ROOT.update()
continue
ROOT.after(16, _display_next_frame)
return
if modules.globals.live_resizable:
temp_frame = fit_image_to_size(
@@ -1167,17 +1218,11 @@ def create_webcam_preview(camera_index: int):
)
image = ctk.CTkImage(image, size=image.size)
preview_label.configure(image=image)
ROOT.update()
if PREVIEW.state() == "withdrawn":
break
ROOT.after(16, _display_next_frame)
# Signal threads to stop and wait for them
stop_event.set()
cap_thread.join(timeout=2.0)
proc_thread.join(timeout=2.0)
cap.release()
PREVIEW.withdraw()
# Kick off the non-blocking display loop
ROOT.after(0, _display_next_frame)
def create_source_target_popup_for_webcam(