mirror of
https://github.com/hacksider/Deep-Live-Cam.git
synced 2026-06-07 21:13:54 +02:00
Fix CUDA-graph replay race + many_faces enhancer regression
Two issues surfaced in post-squash review of f65aeae:
1. CUDA-graph replay buffers were shared across threads with no lock.
`_cuda_graph_swap_inference` updates the module-level `ort_input` and
`ort_latent` buffers in place and then calls `run_with_iobinding`, so
concurrent swap calls on Windows/CUDA could overwrite each other's
bound input buffers before replay, producing wrong-face output. Added
`_cuda_graph_lock` around the full update/run/read sequence.
2. The face enhancer loop unconditionally broke after the first face, so
`many_faces=True` silently enhanced only one face. In addition, the
single-slot temporal cache would have pasted the same enhancement onto
every target if reused in many-faces mode. The break is now gated on
`not many_faces_mode`, and the cache path is disabled in that mode.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -311,7 +311,11 @@ def enhance_face(temp_frame: Frame, detected_faces=None) -> Frame:
|
||||
return temp_frame
|
||||
|
||||
# Temporal caching: only available when faces are pre-detected (live mode)
|
||||
use_cache = detected_faces is not None
|
||||
# AND we're in single-face mode — the cache holds exactly one enhancement,
|
||||
# so reusing it in many_faces mode would paste the same face onto every
|
||||
# detected target.
|
||||
many_faces_mode = getattr(modules.globals, "many_faces", False)
|
||||
use_cache = detected_faces is not None and not many_faces_mode
|
||||
if use_cache:
|
||||
_enh_live_cache['frame_count'] += 1
|
||||
run_inference_this_frame = (_enh_live_cache['frame_count'] % _ENH_INTERVAL == 0
|
||||
@@ -372,7 +376,8 @@ def enhance_face(temp_frame: Frame, detected_faces=None) -> Frame:
|
||||
cached['affine_matrix'],
|
||||
output_size=cached['align_size'],
|
||||
)
|
||||
break # single-face live mode — only process first face
|
||||
if not many_faces_mode:
|
||||
break # single-face live mode — only process first face
|
||||
|
||||
return temp_frame
|
||||
|
||||
|
||||
@@ -168,6 +168,10 @@ _cuda_graph_session = {
|
||||
'ort_latent': None,
|
||||
'recorded': False,
|
||||
}
|
||||
# Serializes CUDA-graph replay. The io_binding + ort_input/ort_latent are
|
||||
# shared across threads and run_with_iobinding mutates GPU-side buffers;
|
||||
# concurrent calls would produce wrong output.
|
||||
_cuda_graph_lock = threading.Lock()
|
||||
|
||||
|
||||
def _init_cuda_graph_session(model_path: str, swapper):
|
||||
@@ -232,10 +236,11 @@ def _init_cuda_graph_session(model_path: str, swapper):
|
||||
def _cuda_graph_swap_inference(blob: np.ndarray, latent: np.ndarray) -> np.ndarray:
|
||||
"""Run swap model via CUDA graph replay — minimal CPU overhead."""
|
||||
cg = _cuda_graph_session
|
||||
cg['ort_input'].update_inplace(blob)
|
||||
cg['ort_latent'].update_inplace(latent)
|
||||
cg['session'].run_with_iobinding(cg['io_binding'])
|
||||
return cg['io_binding'].get_outputs()[0].numpy()
|
||||
with _cuda_graph_lock:
|
||||
cg['ort_input'].update_inplace(blob)
|
||||
cg['ort_latent'].update_inplace(latent)
|
||||
cg['session'].run_with_iobinding(cg['io_binding'])
|
||||
return cg['io_binding'].get_outputs()[0].numpy()
|
||||
|
||||
|
||||
def _fast_paste_back(target_img: Frame, bgr_fake: np.ndarray, aimg: np.ndarray, M: np.ndarray) -> Frame:
|
||||
|
||||
Reference in New Issue
Block a user