Files
Max Buckley f65aeae5db Apple Silicon + Windows CUDA perf: 60 FPS pipeline, cross-platform routing
Bundles CoreML graph rewrites, GPU-accelerated pipeline work, Windows CUDA
fixes, and Mac/Windows runtime routing into a single drop.

CoreML (Apple Silicon):
- Decompose Pad(reflect) → Slice+Concat in inswapper_128 so the model
  runs in one CoreML partition instead of 14 (TEMPORARY: fixed upstream
  in microsoft/onnxruntime#28073, drop when ORT >= 1.26.0).
- Fold Shape/Gather chains to constants in det_10g (21ms → 4ms).
- Decompose Split(axis=1) → Slice pairs in GFPGAN (155ms → 89ms).
- Route detection model to GPU so the ANE is free for the swap model.
- Centralize provider/config selection in create_onnx_session.

Pipeline (all platforms):
- Parallelize face landmark + recognition post-detection; skip landmark_2d_106
  when only face_swapper is active.
- Pipeline face detection with swap for ANE overlap.
- GPU-accelerated paste_back, MJPEG capture, zero-copy display path.
- Standalone pipeline benchmark script.

Windows / CUDA:
- CUDA graphs + FP16 model + all-GPU pipeline for 1080p 60 FPS.
- Auto-detect GPU provider and fix DLL discovery for Windows CUDA execution.

Cross-platform:
- platform_info helper for Mac/Windows runtime routing.
- GFPGAN 30 fps + MSMF camera 60 fps with adaptive pipeline tuning.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 10:44:59 +02:00

286 lines
9.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# --- START OF FILE gpu_processing.py ---
"""
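Optionally GPU-accelerated image processing using OpenCV CUDA (cv2.cuda.GpuMat).
Provides drop-in replacements for common cv2 functions. The CUDA path is
opt-in: it is only considered when the environment variable
OPENCV_CUDA_PROCESSING=1 is set and the installed OpenCV build exposes the
required cv2.cuda functions. In that case the functions upload → process →
download via GpuMat; in every other case (and whenever a CUDA call fails) they
use the regular CPU path, so the rest of the codebase never has to care
whether CUDA is available.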
Usage
-----
from modules.gpu_processing import (
gpu_gaussian_blur, gpu_sharpen, gpu_add_weighted,
gpu_resize, gpu_cvt_color, gpu_flip,
is_gpu_accelerated,
)
"""
from __future__ import annotations
import os
import cv2
import numpy as np
from typing import Tuple, Optional
# ---------------------------------------------------------------------------
# CUDA availability detection (evaluated once at import time)
# ---------------------------------------------------------------------------
CUDA_AVAILABLE: bool = False
# OpenCV CUDA per-operation acceleration is DISABLED by default.
# Each gpu_* call uploads to GPU, processes, then downloads back to CPU.
# At webcam resolution (~960x540) this upload/download overhead far exceeds
# the time saved on the actual operation, making it slower than pure CPU.
# The heavy lifting (face detection, swap, enhancement) runs on GPU via
# ONNX Runtime's CUDAExecutionProvider, which is where GPU matters.
#
# To force-enable, set OPENCV_CUDA_PROCESSING=1 in your environment.
if os.environ.get("OPENCV_CUDA_PROCESSING") == "1":
    try:
        # Constructing a GpuMat is a cheap probe that the cv2.cuda bindings load.
        _test_mat = cv2.cuda.GpuMat()
        # A CUDA-enabled build can still be running on a machine without a
        # usable device/driver; the count is <= 0 in that case.
        _has_device = cv2.cuda.getCudaEnabledDeviceCount() > 0
        _has_gauss = hasattr(cv2.cuda, "createGaussianFilter")
        _has_resize = hasattr(cv2.cuda, "resize")
        _has_cvt = hasattr(cv2.cuda, "cvtColor")
        # addWeighted/flip are called by this module too; the gpu_* wrappers
        # only catch cv2.error, so a missing attribute must be ruled out here.
        _has_arith = hasattr(cv2.cuda, "addWeighted") and hasattr(cv2.cuda, "flip")
        if _has_device and _has_gauss and _has_resize and _has_cvt and _has_arith:
            CUDA_AVAILABLE = True
            print("[gpu_processing] OpenCV CUDA processing enabled via OPENCV_CUDA_PROCESSING=1.")
        else:
            # The user asked for CUDA explicitly, so say why it is not used.
            print(
                "[gpu_processing] OPENCV_CUDA_PROCESSING=1 was set, but this OpenCV "
                "build has no usable CUDA device or lacks required cv2.cuda "
                "functions; using the CPU path."
            )
    except Exception as exc:
        # Best-effort probe: any failure keeps the CPU path, but is reported
        # instead of being silently swallowed.
        print(
            "[gpu_processing] OPENCV_CUDA_PROCESSING=1 was set, but OpenCV CUDA "
            f"could not be initialised ({exc!r}); using the CPU path."
        )
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _ensure_uint8(img: np.ndarray) -> np.ndarray:
"""Clip and convert to uint8 if necessary."""
if img.dtype != np.uint8:
return np.clip(img, 0, 255).astype(np.uint8)
return img
def _ksize_odd(ksize: Tuple[int, int]) -> Tuple[int, int]:
"""Ensure kernel dimensions are positive and odd (required by GaussianBlur)."""
kw = max(1, ksize[0] // 2 * 2 + 1) if ksize[0] > 0 else 0
kh = max(1, ksize[1] // 2 * 2 + 1) if ksize[1] > 0 else 0
return (kw, kh)
def _cv_type_for(img: np.ndarray) -> int:
    """Map the channel count of *img* to an 8-bit OpenCV type constant.

    A 2-D array counts as single-channel; otherwise the size of the third
    axis is used. Only 1, 3 and 4 channels have dedicated entries; any other
    channel count falls back to ``cv2.CV_8UC3``. The dtype of *img* is not
    inspected, so callers are expected to pass ``uint8`` data.
    """
    n_channels = img.shape[2] if img.ndim != 2 else 1
    by_channels = {
        1: cv2.CV_8UC1,
        3: cv2.CV_8UC3,
        4: cv2.CV_8UC4,
    }
    return by_channels.get(n_channels, cv2.CV_8UC3)
# ---------------------------------------------------------------------------
# Public API Gaussian Blur
# ---------------------------------------------------------------------------
def gpu_gaussian_blur(
    src: np.ndarray,
    ksize: Tuple[int, int],
    sigma_x: float,
    sigma_y: float = 0,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.GaussianBlur`` with optional CUDA acceleration.

    Parameters match ``cv2.GaussianBlur(src, ksize, sigmaX, sigmaY)``.
    When *ksize* is ``(0, 0)`` OpenCV computes the kernel size from *sigma_x*.

    The CUDA path is used only when ``CUDA_AVAILABLE`` is set and *src* is
    ``uint8``. Other dtypes (for example float masks) always take the CPU
    path, so the result keeps the dtype and values ``cv2.GaussianBlur`` would
    produce instead of being clipped and cast to ``uint8``. On the CUDA path
    an even kernel dimension is rounded up to the next odd value.

    Returns:
        The blurred image as a new array.
    """
    if CUDA_AVAILABLE and src.dtype == np.uint8:
        try:
            cv_type = _cv_type_for(src)
            ks = _ksize_odd(ksize) if ksize != (0, 0) else ksize
            gauss = cv2.cuda.createGaussianFilter(cv_type, cv_type, ks, sigma_x, sigma_y)
            gpu_src = cv2.cuda.GpuMat()
            gpu_src.upload(src)
            return gauss.apply(gpu_src).download()
        except cv2.error:
            # Deliberate best-effort: any OpenCV failure on the GPU falls
            # through to the CPU implementation below.
            pass
    return cv2.GaussianBlur(src, ksize, sigma_x, sigmaY=sigma_y)
# ---------------------------------------------------------------------------
# Public API addWeighted
# ---------------------------------------------------------------------------
def gpu_add_weighted(
    src1: np.ndarray,
    alpha: float,
    src2: np.ndarray,
    beta: float,
    gamma: float,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.addWeighted`` with optional CUDA acceleration.

    Computes ``src1 * alpha + src2 * beta + gamma`` with the same parameters
    as ``cv2.addWeighted``.

    The CUDA path is used only when ``CUDA_AVAILABLE`` is set and both inputs
    are ``uint8``. Any other dtype takes the CPU path, so the inputs are never
    silently clipped and cast to ``uint8`` before blending.

    Returns:
        The blended image as a new array.
    """
    both_u8 = CUDA_AVAILABLE and src1.dtype == np.uint8 and src2.dtype == np.uint8
    if both_u8:
        try:
            g1 = cv2.cuda.GpuMat()
            g2 = cv2.cuda.GpuMat()
            g1.upload(src1)
            g2.upload(src2)
            gpu_dst = cv2.cuda.addWeighted(g1, alpha, g2, beta, gamma)
            return gpu_dst.download()
        except cv2.error:
            # Deliberate best-effort: fall through to the CPU implementation.
            pass
    return cv2.addWeighted(src1, alpha, src2, beta, gamma)
# ---------------------------------------------------------------------------
# Public API Unsharp-mask sharpening
# ---------------------------------------------------------------------------
def gpu_sharpen(
    src: np.ndarray,
    strength: float,
    sigma: float = 3,
) -> np.ndarray:
    """Unsharp-mask sharpening, optionally GPU-accelerated.

    Equivalent to::

        blurred = GaussianBlur(src, (0, 0), sigma)
        result = addWeighted(src, 1 + strength, blurred, -strength, 0)

    Args:
        src: Image to sharpen.
        strength: Sharpening amount. Values ``<= 0`` disable sharpening and
            *src* itself is returned unchanged (same object, no copy).
        sigma: Gaussian sigma used for the blurred copy.

    Returns:
        The sharpened image as ``uint8`` (or *src* itself when
        ``strength <= 0``).

    The CUDA path is used only when ``CUDA_AVAILABLE`` is set and *src* is
    ``uint8``. Other dtypes are filtered on the CPU at their own precision
    and only clipped and cast at the very end.
    """
    if strength <= 0:
        return src
    if CUDA_AVAILABLE and src.dtype == np.uint8:
        try:
            cv_type = _cv_type_for(src)
            gauss = cv2.cuda.createGaussianFilter(cv_type, cv_type, (0, 0), sigma)
            gpu_src = cv2.cuda.GpuMat()
            gpu_src.upload(src)
            gpu_blurred = gauss.apply(gpu_src)
            gpu_sharp = cv2.cuda.addWeighted(gpu_src, 1.0 + strength, gpu_blurred, -strength, 0)
            # The inputs are uint8, so the downloaded result is already
            # saturated uint8; no extra clip/cast copy is needed.
            return gpu_sharp.download()
        except cv2.error:
            # Deliberate best-effort: fall through to the CPU implementation.
            pass
    blurred = cv2.GaussianBlur(src, (0, 0), sigma)
    sharpened = cv2.addWeighted(src, 1.0 + strength, blurred, -strength, 0)
    # Needed for non-uint8 inputs, whose weighted sum can leave [0, 255].
    return np.clip(sharpened, 0, 255).astype(np.uint8)
# ---------------------------------------------------------------------------
# Public API Resize
# ---------------------------------------------------------------------------
# Interpolation flags for which the CUDA resize path is attempted, mapped to
# the flag passed to cv2.cuda.resize. Flags missing from this table are never
# sent to the GPU; gpu_resize handles them on the CPU instead.
_INTERP_MAP = {
    cv2.INTER_NEAREST: cv2.INTER_NEAREST,
    cv2.INTER_LINEAR: cv2.INTER_LINEAR,
    cv2.INTER_CUBIC: cv2.INTER_CUBIC,
    cv2.INTER_AREA: cv2.INTER_AREA,
    cv2.INTER_LANCZOS4: cv2.INTER_LANCZOS4,
}


def gpu_resize(
    src: np.ndarray,
    dsize: Tuple[int, int],
    fx: float = 0,
    fy: float = 0,
    interpolation: int = cv2.INTER_LINEAR,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.resize`` with optional CUDA acceleration.

    Parameters match ``cv2.resize(src, dsize, fx=fx, fy=fy, interpolation=...)``.
    When *dsize* is empty or not strictly positive, the scale factors *fx* and
    *fy* are used instead.

    The CUDA path is used only when ``CUDA_AVAILABLE`` is set, *src* is
    ``uint8`` and *interpolation* is listed in ``_INTERP_MAP``. Other dtypes
    and unlisted interpolation flags take the CPU path, so the input is never
    clipped and cast to ``uint8`` and the requested interpolation is never
    silently replaced with ``INTER_LINEAR``.

    Returns:
        The resized image as a new array.
    """
    use_gpu = CUDA_AVAILABLE and src.dtype == np.uint8 and interpolation in _INTERP_MAP
    if use_gpu:
        try:
            gpu_src = cv2.cuda.GpuMat()
            gpu_src.upload(src)
            interp = _INTERP_MAP[interpolation]
            if dsize and dsize[0] > 0 and dsize[1] > 0:
                gpu_dst = cv2.cuda.resize(gpu_src, dsize, interpolation=interp)
            else:
                gpu_dst = cv2.cuda.resize(gpu_src, (0, 0), fx=fx, fy=fy, interpolation=interp)
            return gpu_dst.download()
        except cv2.error:
            # Deliberate best-effort: if the CUDA build rejects the request,
            # fall through to the CPU implementation.
            pass
    return cv2.resize(src, dsize, fx=fx, fy=fy, interpolation=interpolation)
# ---------------------------------------------------------------------------
# Public API Color conversion
# ---------------------------------------------------------------------------
def gpu_cvt_color(
    src: np.ndarray,
    code: int,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.cvtColor`` with optional CUDA acceleration.

    Parameters match ``cv2.cvtColor(src, code)``.

    The CUDA path is used only when ``CUDA_AVAILABLE`` is set and *src* is
    ``uint8``. Other dtypes take the CPU path, so the input is never clipped
    and cast to ``uint8`` before conversion.

    Returns:
        The converted image as a new array.
    """
    if CUDA_AVAILABLE and src.dtype == np.uint8:
        try:
            gpu_src = cv2.cuda.GpuMat()
            gpu_src.upload(src)
            return cv2.cuda.cvtColor(gpu_src, code).download()
        except cv2.error:
            # Deliberate best-effort: fall through to the CPU implementation.
            pass
    return cv2.cvtColor(src, code)
# ---------------------------------------------------------------------------
# Public API Flip
# ---------------------------------------------------------------------------
def gpu_flip(
    src: np.ndarray,
    flip_code: int,
) -> np.ndarray:
    """Drop-in replacement for ``cv2.flip`` with optional CUDA acceleration.

    Parameters match ``cv2.flip(src, flipCode)``.
    *flip_code*: 0 = vertical, 1 = horizontal, -1 = both.

    The CUDA path is used only when ``CUDA_AVAILABLE`` is set and *src* is
    ``uint8``. Other dtypes take the CPU path, so a flip never changes the
    dtype or values of the input.

    Returns:
        The flipped image as a new array.
    """
    if CUDA_AVAILABLE and src.dtype == np.uint8:
        try:
            gpu_src = cv2.cuda.GpuMat()
            gpu_src.upload(src)
            return cv2.cuda.flip(gpu_src, flip_code).download()
        except cv2.error:
            # Deliberate best-effort: fall through to the CPU implementation.
            pass
    return cv2.flip(src, flip_code)
# ---------------------------------------------------------------------------
# Convenience: check at runtime whether GPU path is active
# ---------------------------------------------------------------------------
def is_gpu_accelerated() -> bool:
    """Report whether the OpenCV CUDA path was enabled at import time.

    Returns the module-level ``CUDA_AVAILABLE`` flag as-is.
    """
    return CUDA_AVAILABLE
# --- END OF FILE gpu_processing.py ---