diff --git a/modules/processors/frame/face_enhancer.py b/modules/processors/frame/face_enhancer.py index 7c9dde9..ded60d7 100644 --- a/modules/processors/frame/face_enhancer.py +++ b/modules/processors/frame/face_enhancer.py @@ -311,7 +311,11 @@ def enhance_face(temp_frame: Frame, detected_faces=None) -> Frame: return temp_frame # Temporal caching: only available when faces are pre-detected (live mode) - use_cache = detected_faces is not None + # AND we're in single-face mode — the cache holds exactly one enhancement, + # so reusing it in many_faces mode would paste the same face onto every + # detected target. + many_faces_mode = getattr(modules.globals, "many_faces", False) + use_cache = detected_faces is not None and not many_faces_mode if use_cache: _enh_live_cache['frame_count'] += 1 run_inference_this_frame = (_enh_live_cache['frame_count'] % _ENH_INTERVAL == 0 @@ -372,7 +376,8 @@ def enhance_face(temp_frame: Frame, detected_faces=None) -> Frame: cached['affine_matrix'], output_size=cached['align_size'], ) - break # single-face live mode — only process first face + if not many_faces_mode: + break # single-face live mode — only process first face return temp_frame diff --git a/modules/processors/frame/face_swapper.py b/modules/processors/frame/face_swapper.py index 91b403e..971a37a 100644 --- a/modules/processors/frame/face_swapper.py +++ b/modules/processors/frame/face_swapper.py @@ -168,6 +168,10 @@ _cuda_graph_session = { 'ort_latent': None, 'recorded': False, } +# Serializes CUDA-graph replay. The io_binding + ort_input/ort_latent are +# shared across threads and run_with_iobinding mutates GPU-side buffers; +# concurrent calls would produce wrong output. +_cuda_graph_lock = threading.Lock() def _init_cuda_graph_session(model_path: str, swapper): @@ -232,10 +236,11 @@ def _init_cuda_graph_session(model_path: str, swapper): def _cuda_graph_swap_inference(blob: np.ndarray, latent: np.ndarray) -> np.ndarray: """Run swap model via CUDA graph replay — minimal CPU overhead.""" cg = _cuda_graph_session - cg['ort_input'].update_inplace(blob) - cg['ort_latent'].update_inplace(latent) - cg['session'].run_with_iobinding(cg['io_binding']) - return cg['io_binding'].get_outputs()[0].numpy() + with _cuda_graph_lock: + cg['ort_input'].update_inplace(blob) + cg['ort_latent'].update_inplace(latent) + cg['session'].run_with_iobinding(cg['io_binding']) + return cg['io_binding'].get_outputs()[0].numpy() def _fast_paste_back(target_img: Frame, bgr_fake: np.ndarray, aimg: np.ndarray, M: np.ndarray) -> Frame: