diff --git a/modules/processors/frame/face_enhancer.py b/modules/processors/frame/face_enhancer.py
index 7c9dde9..ded60d7 100644
--- a/modules/processors/frame/face_enhancer.py
+++ b/modules/processors/frame/face_enhancer.py
@@ -311,7 +311,11 @@ def enhance_face(temp_frame: Frame, detected_faces=None) -> Frame:
         return temp_frame
 
     # Temporal caching: only available when faces are pre-detected (live mode)
-    use_cache = detected_faces is not None
+    # AND we're in single-face mode — the cache holds exactly one enhancement,
+    # so reusing it in many_faces mode would paste the same face onto every
+    # detected target.
+    many_faces_mode = getattr(modules.globals, "many_faces", False)
+    use_cache = detected_faces is not None and not many_faces_mode
     if use_cache:
         _enh_live_cache['frame_count'] += 1
         run_inference_this_frame = (_enh_live_cache['frame_count'] % _ENH_INTERVAL == 0
@@ -372,7 +376,8 @@ def enhance_face(temp_frame: Frame, detected_faces=None) -> Frame:
                     cached['affine_matrix'],
                     output_size=cached['align_size'],
                 )
-        break  # single-face live mode — only process first face
+        if not many_faces_mode:
+            break  # single-face mode — only process the first face
 
     return temp_frame
 
diff --git a/modules/processors/frame/face_swapper.py b/modules/processors/frame/face_swapper.py
index 91b403e..971a37a 100644
--- a/modules/processors/frame/face_swapper.py
+++ b/modules/processors/frame/face_swapper.py
@@ -168,6 +168,10 @@ _cuda_graph_session = {
     'ort_latent': None,
     'recorded': False,
 }
+# Serializes CUDA-graph replay. The io_binding and ort_input/ort_latent are
+# shared across threads, and run_with_iobinding mutates GPU-side buffers, so
+# concurrent calls could overwrite each other's inputs and return wrong output.
+_cuda_graph_lock = threading.Lock()
 
 
 def _init_cuda_graph_session(model_path: str, swapper):
@@ -232,10 +236,11 @@ def _init_cuda_graph_session(model_path: str, swapper):
 def _cuda_graph_swap_inference(blob: np.ndarray, latent: np.ndarray) -> np.ndarray:
     """Run swap model via CUDA graph replay — minimal CPU overhead."""
     cg = _cuda_graph_session
-    cg['ort_input'].update_inplace(blob)
-    cg['ort_latent'].update_inplace(latent)
-    cg['session'].run_with_iobinding(cg['io_binding'])
-    return cg['io_binding'].get_outputs()[0].numpy()
+    with _cuda_graph_lock:
+        cg['ort_input'].update_inplace(blob)
+        cg['ort_latent'].update_inplace(latent)
+        cg['session'].run_with_iobinding(cg['io_binding'])
+        return cg['io_binding'].get_outputs()[0].numpy()
 
 
 def _fast_paste_back(target_img: Frame, bgr_fake: np.ndarray, aimg: np.ndarray, M: np.ndarray) -> Frame: