From eac2ad23077e697d798f97342deaf912bcdaab37 Mon Sep 17 00:00:00 2001
From: ozp3 <barisozpulat@gmail.com>
Date: Sat, 28 Mar 2026 13:09:20 +0300
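Subject: [PATCH] feat: AMD DML optimization - GPU face detection, detection
 throttle, pre-load fix

Wrap the face-analyser and face-swapper inference calls in a shared
modules.globals.dml_lock, run face detection inline in the processing
thread on every third frame instead of starting the separate detection
thread, and load the face analyser and face swapper before the webcam
preview is created.

Also raise OMP_NUM_THREADS from 1 to 6, shrink the default preview to
640x360, and add run-dml.bat plus a DeepLiveCam.lnk shortcut.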

---
 DeepLiveCam.lnk                          | Bin 0 -> 1110 bytes
 modules/core.py                          |   7 +++--
 modules/face_analyser.py                 |  10 +++++--
 modules/globals.py                       |   3 ++
 modules/processors/frame/face_swapper.py |  10 +++----
 modules/ui.py                            |  34 ++++++++++++++++++-----
 run-dml.bat                              |   5 ++++
 7 files changed, 52 insertions(+), 17 deletions(-)
 create mode 100644 DeepLiveCam.lnk
 create mode 100644 run-dml.bat

diff --git a/DeepLiveCam.lnk b/DeepLiveCam.lnk
new file mode 100644
index 0000000000000000000000000000000000000000..f89232025d1cbdbf4ad98bdc64529fa8744bf813
GIT binary patch
literal 1110
zcmbtTPe@a79RED?Fhgr2jasv25Lxub7Ia9E2Aer((!fTeUg(=!xZJ<?rWO=MP$XzX
zP#2{-nCoB%578-+AVTOM9fBkxyoH6VE)hk2zS~~x;ONlr@%jFL-#_p7eL(<9a~&zL
zP@*_G2@9N*@4R#z(&pbQfp3emedIB;mUO)w2c^PI_8Nomm<p`$$KzQwm>7wxK0SpX
z8c>h5c{9Le3hQz|;(W4D`CbAelvVhkBgJS3qBuwyxTINHoX8EQ)mSR&it2d;DpkZI
zkSaeWrNoMmsxwu%f*gjCX66|4CgEaU9eKvs35{?NGNo{pkaL=_5gUjZ-|9UpIeZ;j
z*ofy0E#%YuabqZ-$&y!%uT?jRCf3qwRBz*)9rT7_aOxbXh7fHU<eL~6Vuu-inX<nA
zH|l3g?C)VjdgF=exrCv{%8gN9`Q_aknY-FGq2Bt?bSC`i_vg{uNA?$OH($VhdHKBU
zUw#J{rh)mdGnc0~)ceK#eA$Pw?!xz_Ez`!?-3i)b1a=Vzh}zfbM0;i6W4w$cADvg?
zQ3eQ+bAQM0p=J<2N%N%pqyWz%Ax*f8d!DUz+pUj^b^YV_S7*h)Q39liMxxejx6rql
zfLF-lsnU{*QC@-w=EiqXt{0=E+)kY2a;7<I2BYlCi4&!(Npcb=+)AI~<m4H8E|z-t
z$S#!<ZW6&unE#aMV}uMcb8eu$^X?nl)3;bS{%+fn?EFp%kgBrK|Gyx$mQShxhY4w9
tZfnZ_WFi|V1iY^4&I{wMv=2%^fQWb-+bq{k^*s1`!)X4v<CWOW^9Qt#+lBxD

literal 0
HcmV?d00001

diff --git a/modules/core.py b/modules/core.py
index 663d742..4280edb 100644
--- a/modules/core.py
+++ b/modules/core.py
@@ -2,7 +2,7 @@ import os
 import sys
 # single thread doubles cuda performance - needs to be set before torch import
 if any(arg.startswith('--execution-provider') for arg in sys.argv):
-    os.environ['OMP_NUM_THREADS'] = '1'
+    os.environ['OMP_NUM_THREADS'] = '6'
 # reduce tensorflow log level
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 import warnings
@@ -291,9 +291,12 @@ def run() -> None:
     for frame_processor in get_frame_processors_modules(modules.globals.frame_processors):
         if not frame_processor.pre_check():
             return
+    # Disabled pre-load of the face analyser in the main thread before the GUI starts; ui.webcam_preview() loads it instead.
+    #from modules.face_analyser import get_face_analyser
+    #get_face_analyser()
     limit_resources()
     if modules.globals.headless:
         start()
     else:
         window = ui.init(start, destroy, modules.globals.lang)
-        window.mainloop()
+        window.mainloop()
\ No newline at end of file
diff --git a/modules/face_analyser.py b/modules/face_analyser.py
index 696398b..5860880 100644
--- a/modules/face_analyser.py
+++ b/modules/face_analyser.py
@@ -35,7 +35,9 @@ def get_face_analyser() -> Any:
 
 
 def get_one_face(frame: Frame) -> Any:
-    face = get_face_analyser().get(frame)
+    import modules.globals as g
+    with g.dml_lock:
+        face = get_face_analyser().get(frame)
     try:
         return min(face, key=lambda x: x.bbox[0])
     except ValueError:
@@ -43,8 +45,10 @@ def get_one_face(frame: Frame) -> Any:
 
 
 def get_many_faces(frame: Frame) -> Any:
+    import modules.globals as g
     try:
-        return get_face_analyser().get(frame)
+        with g.dml_lock:
+            return get_face_analyser().get(frame)
     except IndexError:
         return None
 
@@ -196,4 +200,4 @@ def dump_faces(centroids: Any, frame_face_embeddings: list):
 
                     if temp_frame[int(y_min):int(y_max), int(x_min):int(x_max)].size > 0:
                         cv2.imwrite(temp_directory_path + f"/{i}/{frame['frame']}_{j}.png", temp_frame[int(y_min):int(y_max), int(x_min):int(x_max)])
-                j += 1
\ No newline at end of file
+                j += 1
diff --git a/modules/globals.py b/modules/globals.py
index aabc19a..3d88931 100644
--- a/modules/globals.py
+++ b/modules/globals.py
@@ -71,3 +71,6 @@ interpolation_weight: float = 0  # Blend weight for current frame (0.0-1.0). Low
 # --- END: Added for Frame Interpolation ---
 
 # --- END OF FILE globals.py ---
+
+import threading
+dml_lock = threading.Lock()
diff --git a/modules/processors/frame/face_swapper.py b/modules/processors/frame/face_swapper.py
index 04f846b..57bf508 100644
--- a/modules/processors/frame/face_swapper.py
+++ b/modules/processors/frame/face_swapper.py
@@ -110,7 +110,6 @@ def get_face_swapper() -> Any:
                         ))
                     else:
                         providers_config.append(p)
-                
                 FACE_SWAPPER = insightface.model_zoo.get_model(
                     model_path,
                     providers=providers_config,
@@ -153,9 +152,10 @@ def swap_face(source_face: Face, target_face: Face, temp_frame: Frame) -> Frame:
         if not temp_frame.flags['C_CONTIGUOUS']:
             temp_frame = np.ascontiguousarray(temp_frame)
         
-        swapped_frame_raw = face_swapper.get(
-            temp_frame, target_face, source_face, paste_back=True
-        )
+        with modules.globals.dml_lock:
+            swapped_frame_raw = face_swapper.get(
+                temp_frame, target_face, source_face, paste_back=True
+            )
 
         # --- START: CRITICAL FIX FOR ORT 1.17 ---
         # Check the output type and range from the model
@@ -1183,4 +1183,4 @@ def apply_color_transfer(source, target):
          # traceback.print_exc()
          return source
 
-    return result_bgr
\ No newline at end of file
+    return result_bgr
diff --git a/modules/ui.py b/modules/ui.py
index 7143076..6041fc4 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -72,8 +72,8 @@ ROOT_WIDTH = 600
 PREVIEW = None
 PREVIEW_MAX_HEIGHT = 700
 PREVIEW_MAX_WIDTH = 1200
-PREVIEW_DEFAULT_WIDTH = 960
-PREVIEW_DEFAULT_HEIGHT = 540
+PREVIEW_DEFAULT_WIDTH = 640
+PREVIEW_DEFAULT_HEIGHT = 360
 
 POPUP_WIDTH = 750
 POPUP_HEIGHT = 810
@@ -1000,6 +1000,10 @@ def webcam_preview(root: ctk.CTk, camera_index: int):
         if modules.globals.source_path is None:
             update_status("Please select a source image first")
             return
+        from modules.processors.frame.face_swapper import get_face_swapper
+        from modules.face_analyser import get_face_analyser
+        get_face_analyser()
+        get_face_swapper()
         create_webcam_preview(camera_index)
     else:
         modules.globals.source_target_map = []
@@ -1105,7 +1109,7 @@ def _detection_thread_func(latest_frame_holder, detection_result, detection_lock
             frame = latest_frame_holder[0]
 
         if frame is None:
-            time.sleep(0.005)
+            time.sleep(0.2)
             continue
 
         if modules.globals.many_faces:
@@ -1157,7 +1161,22 @@ def _processing_thread_func(capture_queue, processed_queue, stop_event,
                 source_image = get_one_face(cv2.imread(modules.globals.source_path))
 
             # Read latest detection results (brief lock to avoid blocking detection thread)
-            with detection_lock:
+            # detection_lock is no longer taken here: the detection thread is not started, so detection runs inline.
+            # Detect on every 3rd frame; other frames reuse the last result stored in detection_result.
+            if not hasattr(_processing_thread_func, '_det_count'):
+                _processing_thread_func._det_count = 0
+            _processing_thread_func._det_count += 1
+
+            if _processing_thread_func._det_count % 3 == 0:
+                if modules.globals.many_faces:
+                    cached_target_face = None
+                    cached_many_faces = get_many_faces(temp_frame)
+                    detection_result['many_faces'] = cached_many_faces
+                else:
+                    cached_target_face = get_one_face(temp_frame)
+                    cached_many_faces = None
+                    detection_result['target_face'] = cached_target_face
+            else:
                 cached_target_face = detection_result.get('target_face')
                 cached_many_faces = detection_result.get('many_faces')
 
@@ -1275,7 +1294,7 @@ def create_webcam_preview(camera_index: int):
         args=(latest_frame_holder, detection_result, detection_lock, stop_event),
         daemon=True,
     )
-    det_thread.start()
+    # det_thread.start()  # not started: detection runs inline in _processing_thread_func
 
     # Start processing thread
     proc_thread = threading.Thread(
@@ -1316,7 +1335,7 @@ def create_webcam_preview(camera_index: int):
             temp_frame = fit_image_to_size(
                 temp_frame, PREVIEW.winfo_width(), PREVIEW.winfo_height()
             )
-
+        temp_frame = temp_frame.copy()
         image = gpu_cvt_color(temp_frame, cv2.COLOR_BGR2RGB)
         image = Image.fromarray(image)
         image = ImageOps.contain(
@@ -1574,4 +1593,5 @@ def update_webcam_target(
             target_label_dict_live[button_num] = target_image
         else:
             update_pop_live_status("Face could not be detected in last upload!")
-        return map
\ No newline at end of file
+        return map
+
diff --git a/run-dml.bat b/run-dml.bat
new file mode 100644
index 0000000..68a67be
--- /dev/null
+++ b/run-dml.bat
@@ -0,0 +1,5 @@
+@echo off
+cd /d "%~dp0"
+call venv\Scripts\activate
+python run.py --execution-provider dml
+pause