mirror of
https://github.com/hacksider/Deep-Live-Cam.git
synced 2026-04-23 01:26:17 +02:00
f65aeae5db
Bundles CoreML graph rewrites, GPU-accelerated pipeline work, Windows CUDA fixes, and Mac/Windows runtime routing into a single drop. CoreML (Apple Silicon): - Decompose Pad(reflect) → Slice+Concat in inswapper_128 so the model runs in one CoreML partition instead of 14 (TEMPORARY: fixed upstream in microsoft/onnxruntime#28073, drop when ORT >= 1.26.0). - Fold Shape/Gather chains to constants in det_10g (21ms → 4ms). - Decompose Split(axis=1) → Slice pairs in GFPGAN (155ms → 89ms). - Route detection model to GPU so the ANE is free for the swap model. - Centralize provider/config selection in create_onnx_session. Pipeline (all platforms): - Parallelize face landmark + recognition post-detection; skip landmark_2d_106 when only face_swapper is active. - Pipeline face detection with swap for ANE overlap. - GPU-accelerated paste_back, MJPEG capture, zero-copy display path. - Standalone pipeline benchmark script. Windows / CUDA: - CUDA graphs + FP16 model + all-GPU pipeline for 1080p 60 FPS. - Auto-detect GPU provider and fix DLL discovery for Windows CUDA execution. Cross-platform: - platform_info helper for Mac/Windows runtime routing. - GFPGAN 30 fps + MSMF camera 60 fps with adaptive pipeline tuning. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
286 lines
9.3 KiB
Python
286 lines
9.3 KiB
Python
# --- START OF FILE gpu_processing.py ---
"""
GPU-accelerated image processing using OpenCV CUDA (cv2.cuda.GpuMat).

Provides drop-in replacements for common cv2 functions. When OpenCV is built
with CUDA support the functions transparently upload → process → download via
GpuMat; otherwise they fall back to the regular CPU path so the rest of the
codebase never has to care whether CUDA is available.

Usage
-----
    from modules.gpu_processing import (
        gpu_gaussian_blur, gpu_sharpen, gpu_add_weighted,
        gpu_resize, gpu_cvt_color, gpu_flip,
        is_gpu_accelerated,
    )
"""

from __future__ import annotations

import os
from typing import Optional, Tuple

import cv2
import numpy as np
# ---------------------------------------------------------------------------
# CUDA availability detection (evaluated once at import time)
# ---------------------------------------------------------------------------
CUDA_AVAILABLE: bool = False

# Per-operation OpenCV CUDA is OFF by default: each gpu_* helper pays an
# upload-to-GPU / download-to-CPU round trip, and at webcam resolutions
# (~960x540) that transfer overhead outweighs whatever the operation saves,
# so the pure-CPU path is actually faster.  The heavy work (face detection,
# swap, enhancement) already runs on GPU through ONNX Runtime's
# CUDAExecutionProvider, which is where GPU acceleration matters.
#
# Opt in explicitly by setting OPENCV_CUDA_PROCESSING=1 in the environment.
if os.environ.get("OPENCV_CUDA_PROCESSING") == "1":
    try:
        # Constructing a GpuMat raises when the cv2 build has no CUDA support.
        _test_mat = cv2.cuda.GpuMat()
        _required_ops = ("createGaussianFilter", "resize", "cvtColor")
        if all(hasattr(cv2.cuda, _op) for _op in _required_ops):
            CUDA_AVAILABLE = True
            print("[gpu_processing] OpenCV CUDA processing enabled via OPENCV_CUDA_PROCESSING=1.")
    except Exception:
        # Best-effort probe: any failure simply leaves CUDA_AVAILABLE False.
        pass
# ---------------------------------------------------------------------------
|
||
# Internal helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _ensure_uint8(img: np.ndarray) -> np.ndarray:
|
||
"""Clip and convert to uint8 if necessary."""
|
||
if img.dtype != np.uint8:
|
||
return np.clip(img, 0, 255).astype(np.uint8)
|
||
return img
|
||
|
||
|
||
def _ksize_odd(ksize: Tuple[int, int]) -> Tuple[int, int]:
|
||
"""Ensure kernel dimensions are positive and odd (required by GaussianBlur)."""
|
||
kw = max(1, ksize[0] // 2 * 2 + 1) if ksize[0] > 0 else 0
|
||
kh = max(1, ksize[1] // 2 * 2 + 1) if ksize[1] > 0 else 0
|
||
return (kw, kh)
|
||
|
||
|
||
def _cv_type_for(img: np.ndarray) -> int:
    """Return the OpenCV uint8 type constant matching *img*'s channel count."""
    channels = 1 if img.ndim == 2 else img.shape[2]
    type_by_channels = {
        1: cv2.CV_8UC1,
        3: cv2.CV_8UC3,
        4: cv2.CV_8UC4,
    }
    # Unknown channel counts fall back to the 3-channel type.
    return type_by_channels.get(channels, cv2.CV_8UC3)
# ---------------------------------------------------------------------------
# Public API – Gaussian Blur
# ---------------------------------------------------------------------------

def gpu_gaussian_blur(
    src: np.ndarray,
    ksize: Tuple[int, int],
    sigma_x: float,
    sigma_y: float = 0,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.GaussianBlur`` with CUDA acceleration.

    Parameters match ``cv2.GaussianBlur(src, ksize, sigmaX, sigmaY)``.
    When *ksize* is ``(0, 0)`` OpenCV derives the kernel size from *sigma_x*.
    """
    if CUDA_AVAILABLE:
        try:
            frame = _ensure_uint8(src)
            mat_type = _cv_type_for(frame)
            kernel = ksize if ksize == (0, 0) else _ksize_odd(ksize)

            blur_filter = cv2.cuda.createGaussianFilter(
                mat_type, mat_type, kernel, sigma_x, sigma_y
            )
            device_src = cv2.cuda.GpuMat()
            device_src.upload(frame)
            return blur_filter.apply(device_src).download()
        except cv2.error:
            # CUDA path rejected the input — fall through to the CPU path.
            pass

    return cv2.GaussianBlur(src, ksize, sigma_x, sigmaY=sigma_y)
# ---------------------------------------------------------------------------
# Public API – addWeighted
# ---------------------------------------------------------------------------

def gpu_add_weighted(
    src1: np.ndarray,
    alpha: float,
    src2: np.ndarray,
    beta: float,
    gamma: float,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.addWeighted`` with CUDA acceleration."""
    if CUDA_AVAILABLE:
        try:
            first = _ensure_uint8(src1)
            second = _ensure_uint8(src2)
            device_a = cv2.cuda.GpuMat()
            device_b = cv2.cuda.GpuMat()
            device_a.upload(first)
            device_b.upload(second)
            return cv2.cuda.addWeighted(device_a, alpha, device_b, beta, gamma).download()
        except cv2.error:
            pass  # fall through to the CPU implementation

    return cv2.addWeighted(src1, alpha, src2, beta, gamma)
# ---------------------------------------------------------------------------
# Public API – Unsharp-mask sharpening
# ---------------------------------------------------------------------------

def gpu_sharpen(
    src: np.ndarray,
    strength: float,
    sigma: float = 3,
) -> np.ndarray:
    """Unsharp-mask sharpening, optionally GPU-accelerated.

    Equivalent to::

        blurred = GaussianBlur(src, (0,0), sigma)
        result = addWeighted(src, 1+strength, blurred, -strength, 0)
    """
    # Non-positive strength is a no-op; hand the input back untouched.
    if strength <= 0:
        return src

    if CUDA_AVAILABLE:
        try:
            frame = _ensure_uint8(src)
            mat_type = _cv_type_for(frame)

            blur_filter = cv2.cuda.createGaussianFilter(mat_type, mat_type, (0, 0), sigma)
            device_src = cv2.cuda.GpuMat()
            device_src.upload(frame)
            device_blurred = blur_filter.apply(device_src)
            mixed = cv2.cuda.addWeighted(
                device_src, 1.0 + strength, device_blurred, -strength, 0
            ).download()
            return np.clip(mixed, 0, 255).astype(np.uint8)
        except cv2.error:
            pass  # fall through to the CPU implementation

    soft = cv2.GaussianBlur(src, (0, 0), sigma)
    crisp = cv2.addWeighted(src, 1.0 + strength, soft, -strength, 0)
    return np.clip(crisp, 0, 255).astype(np.uint8)
# ---------------------------------------------------------------------------
# Public API – Resize
# ---------------------------------------------------------------------------

# Map common cv2 interpolation flags to their CUDA equivalents.  The mapping
# is the identity today, but keeping an explicit table gives any future
# divergent flag a single place to live.
_INTERP_MAP = {
    flag: flag
    for flag in (
        cv2.INTER_NEAREST,
        cv2.INTER_LINEAR,
        cv2.INTER_CUBIC,
        cv2.INTER_AREA,
        cv2.INTER_LANCZOS4,
    )
}
def gpu_resize(
    src: np.ndarray,
    dsize: Tuple[int, int],
    fx: float = 0,
    fy: float = 0,
    interpolation: int = cv2.INTER_LINEAR,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.resize`` with CUDA acceleration.

    Parameters match ``cv2.resize(src, dsize, fx=fx, fy=fy, interpolation=...)``.
    """
    if CUDA_AVAILABLE:
        try:
            frame = _ensure_uint8(src)
            device_src = cv2.cuda.GpuMat()
            device_src.upload(frame)

            interp = _INTERP_MAP.get(interpolation, cv2.INTER_LINEAR)

            # A concrete positive dsize wins; otherwise scale by fx/fy.
            if dsize and dsize[0] > 0 and dsize[1] > 0:
                device_dst = cv2.cuda.resize(device_src, dsize, interpolation=interp)
            else:
                device_dst = cv2.cuda.resize(
                    device_src, (0, 0), fx=fx, fy=fy, interpolation=interp
                )

            return device_dst.download()
        except cv2.error:
            pass  # fall through to the CPU implementation

    return cv2.resize(src, dsize, fx=fx, fy=fy, interpolation=interpolation)
# ---------------------------------------------------------------------------
# Public API – Color conversion
# ---------------------------------------------------------------------------

def gpu_cvt_color(
    src: np.ndarray,
    code: int,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.cvtColor`` with CUDA acceleration.

    Parameters match ``cv2.cvtColor(src, code)``.
    """
    if CUDA_AVAILABLE:
        try:
            device_src = cv2.cuda.GpuMat()
            device_src.upload(_ensure_uint8(src))
            return cv2.cuda.cvtColor(device_src, code).download()
        except cv2.error:
            pass  # fall through to the CPU implementation

    return cv2.cvtColor(src, code)
# ---------------------------------------------------------------------------
# Public API – Flip
# ---------------------------------------------------------------------------

def gpu_flip(
    src: np.ndarray,
    flip_code: int,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.flip`` with CUDA acceleration.

    Parameters match ``cv2.flip(src, flipCode)``.
    *flip_code*: 0 = vertical, 1 = horizontal, -1 = both.
    """
    if CUDA_AVAILABLE:
        try:
            device_src = cv2.cuda.GpuMat()
            device_src.upload(_ensure_uint8(src))
            return cv2.cuda.flip(device_src, flip_code).download()
        except cv2.error:
            pass  # fall through to the CPU implementation

    return cv2.flip(src, flip_code)
# ---------------------------------------------------------------------------
# Convenience: check at runtime whether GPU path is active
# ---------------------------------------------------------------------------

def is_gpu_accelerated() -> bool:
    """Report whether the gpu_* helpers will take the OpenCV-CUDA fast path."""
    return CUDA_AVAILABLE

# --- END OF FILE gpu_processing.py ---