From ca6cba9311078cb6a32aa69f133a1be6c4e92aab Mon Sep 17 00:00:00 2001
From: Lauri Gates <lauri.gates@forumvirium.fi>
Date: Sun, 22 Feb 2026 18:41:47 +0200
Subject: [PATCH] perf(ui): decouple face detection from swap in live webcam
 pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a dedicated detection thread that continuously runs face detection
on the most recent frame published by the processing thread and stores
the results in a shared, lock-guarded dict. The processing/swap thread
reads these cached results instead of running detection inline, so it
no longer waits on the 15-30ms detection cost. This replaces the
previous DETECT_EVERY_N frame-skipping cache.

Architecture change: 2 worker threads → 3 worker threads (plus the Tk UI thread)
  Before: capture → [detect + swap] → display
  After:  capture → swap (uses cached detections) → display
                  ↘ detect (async, writes to shared cache) ↗

Also replaces the blocking while/ROOT.update() display loop with
ROOT.after()-based scheduling, which avoids Tk event loop re-entrancy
issues and UI freezes.

Closes #1664
---
 modules/ui.py | 127 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 86 insertions(+), 41 deletions(-)

diff --git a/modules/ui.py b/modules/ui.py
index 74681bc..e776608 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -997,28 +997,48 @@ def _capture_thread_func(cap, capture_queue, stop_event):
                 pass
 
 
-# How often to run full face detection. On intermediate frames the last
-# detected face positions are reused, which significantly reduces the
-# per-frame cost of the processing thread.
-DETECT_EVERY_N = 2
+def _detection_thread_func(latest_frame_holder, detection_result, detection_lock, stop_event):
+    """Detection thread: continuously runs face detection on the latest
+    captured frame and stores results in detection_result under detection_lock.
+
+    This decouples face detection (~15-30ms) from face swapping (~5-10ms)
+    so the swap loop never blocks on detection, significantly improving
+    live mode FPS."""
+    while not stop_event.is_set():
+        with detection_lock:
+            frame = latest_frame_holder[0]
+
+        if frame is None:
+            time.sleep(0.005)
+            continue
+
+        if modules.globals.many_faces:
+            many = get_many_faces(frame)
+            with detection_lock:
+                detection_result['target_face'] = None
+                detection_result['many_faces'] = many
+        else:
+            face = get_one_face(frame)
+            with detection_lock:
+                detection_result['target_face'] = face
+                detection_result['many_faces'] = None
 
 
-def _processing_thread_func(capture_queue, processed_queue, stop_event):
-    """Processing thread: takes raw frames from capture_queue, applies face
-    processing, and puts results into processed_queue. Drops processed frames
-    when the output queue is full so the UI always gets the latest result.
+def _processing_thread_func(capture_queue, processed_queue, stop_event,
+                             latest_frame_holder, detection_result, detection_lock):
+    """Processing thread: takes raw frames from capture_queue, reads the
+    latest detection result from the shared detection_result dict, applies
+    face swap/enhancement, and puts results into processed_queue.
 
-    Uses DETECT_EVERY_N to skip expensive face detection on intermediate
-    frames, reusing cached face positions instead."""
+    Face detection runs concurrently in _detection_thread_func — this thread
+    only reads cached results so it never blocks on detection."""
     frame_processors = get_frame_processors_modules(modules.globals.frame_processors)
     source_image = None
+    last_source_path = None
     prev_time = time.time()
     fps_update_interval = 0.5
     frame_count = 0
     fps = 0
-    proc_frame_index = 0
-    cached_target_face = None  # cached single-face result
-    cached_many_faces = None   # cached many-faces result
 
     while not stop_event.is_set():
         try:
@@ -1026,32 +1046,31 @@ def _processing_thread_func(capture_queue, processed_queue, stop_event):
         except queue.Empty:
             continue
 
-        temp_frame = frame.copy()
-        run_detection = (proc_frame_index % DETECT_EVERY_N == 0)
-        proc_frame_index += 1
+        temp_frame = frame
 
         if modules.globals.live_mirror:
             temp_frame = gpu_flip(temp_frame, 1)
 
+        # Publish the mirrored frame for the detection thread to pick up
+        with detection_lock:
+            latest_frame_holder[0] = temp_frame
+
         if not modules.globals.map_faces:
-            if source_image is None and modules.globals.source_path:
+            if modules.globals.source_path and modules.globals.source_path != last_source_path:
+                last_source_path = modules.globals.source_path
                 source_image = get_one_face(cv2.imread(modules.globals.source_path))
 
-            # Update face detection cache on detection frames
-            if run_detection or (cached_target_face is None and cached_many_faces is None):
-                if modules.globals.many_faces:
-                    cached_many_faces = get_many_faces(temp_frame)
-                    cached_target_face = None
-                else:
-                    cached_target_face = get_one_face(temp_frame)
-                    cached_many_faces = None
+            # Read latest detection results (brief lock to avoid blocking detection thread)
+            with detection_lock:
+                cached_target_face = detection_result.get('target_face')
+                cached_many_faces = detection_result.get('many_faces')
 
             for frame_processor in frame_processors:
                 if frame_processor.NAME == "DLC.FACE-ENHANCER":
                     if modules.globals.fp_ui["face_enhancer"]:
                         temp_frame = frame_processor.process_frame(None, temp_frame)
                 elif frame_processor.NAME == "DLC.FACE-SWAPPER":
-                    # Use cached face positions to skip redundant detection
+                    # Use cached face positions from detection thread
                     swapped_bboxes = []
                     if modules.globals.many_faces and cached_many_faces:
                         result = temp_frame.copy()
@@ -1127,6 +1146,14 @@ def create_webcam_preview(camera_index: int):
     processed_queue = queue.Queue(maxsize=2)
     stop_event = threading.Event()
 
+    # Shared state for the detection pipeline.
+    # latest_frame_holder[0] is the latest frame (post-mirror) published by
+    # the processing thread; detection_result holds the last detected faces
+    # for the processing thread to read.  Both are guarded by detection_lock.
+    detection_lock = threading.Lock()
+    latest_frame_holder = [None]
+    detection_result = {'target_face': None, 'many_faces': None}
+
     # Start capture thread
     cap_thread = threading.Thread(
         target=_capture_thread_func,
@@ -1135,21 +1162,45 @@ def create_webcam_preview(camera_index: int):
     )
     cap_thread.start()
 
+    # Start detection thread — runs face detection asynchronously so the
+    # processing/swap thread never blocks on it
+    det_thread = threading.Thread(
+        target=_detection_thread_func,
+        args=(latest_frame_holder, detection_result, detection_lock, stop_event),
+        daemon=True,
+    )
+    det_thread.start()
+
     # Start processing thread
     proc_thread = threading.Thread(
         target=_processing_thread_func,
-        args=(capture_queue, processed_queue, stop_event),
+        args=(capture_queue, processed_queue, stop_event,
+              latest_frame_holder, detection_result, detection_lock),
         daemon=True,
     )
     proc_thread.start()
 
-    # Main (UI) thread: pull processed frames and update the display
-    while not stop_event.is_set():
+    # Cleanup helper called from the display loop when preview closes
+    def _cleanup():
+        stop_event.set()
+        cap_thread.join(timeout=2.0)
+        det_thread.join(timeout=2.0)
+        proc_thread.join(timeout=2.0)
+        cap.release()
+        PREVIEW.withdraw()
+
+    # Non-blocking display loop using ROOT.after() — avoids blocking the
+    # Tk event loop which could cause UI freezes or re-entrancy issues
+    def _display_next_frame():
+        if stop_event.is_set() or PREVIEW.state() == "withdrawn":
+            _cleanup()
+            return
+
         try:
-            temp_frame = processed_queue.get(timeout=0.03)
+            temp_frame = processed_queue.get_nowait()
         except queue.Empty:
-            ROOT.update()
-            continue
+            ROOT.after(16, _display_next_frame)
+            return
 
         if modules.globals.live_resizable:
             temp_frame = fit_image_to_size(
@@ -1167,17 +1218,11 @@ def create_webcam_preview(camera_index: int):
         )
         image = ctk.CTkImage(image, size=image.size)
         preview_label.configure(image=image)
-        ROOT.update()
 
-        if PREVIEW.state() == "withdrawn":
-            break
+        ROOT.after(16, _display_next_frame)
 
-    # Signal threads to stop and wait for them
-    stop_event.set()
-    cap_thread.join(timeout=2.0)
-    proc_thread.join(timeout=2.0)
-    cap.release()
-    PREVIEW.withdraw()
+    # Kick off the non-blocking display loop
+    ROOT.after(0, _display_next_frame)
 
 
 def create_source_target_popup_for_webcam(