remove-ai-watermarks/src/remove_ai_watermarks/auto_config.py

"""Automatic pipeline planning for the ``--auto`` quality mode.

``plan(image_path)`` inspects the INPUT image (before the diffusion model loads)
and returns the quality modes to use, so the pipeline can adapt to content. It is
meant to run as the FIRST step of the invisible/all pipeline, wherever that pipeline
runs (locally, or the raiw.cc Modal GPU worker) -- never on a memory-constrained web
host (image work there OOM-crashes the container).

Routing is **quality-priority**: ControlNet (text/face-structure preservation) is the
default; it is only skipped for a clearly structure-less image (no face, no text,
near-zero edges), where plain SDXL is cheaper and just as good. GFPGAN face
restoration is enabled when a face is present. When a smoothing pass (controlnet or
face restore) ran, the **adaptive polish** (``humanizer.adaptive_polish``) restores
the input's detail level -- a capped unsharp + edge-masked grain targeting the input's
Laplacian variance -- to counter the over-smoothed "AI look". It is self-limiting on
text/graphics (already high-frequency, so almost no polish) and spares text/edges by
masking the grain.

Detection is **cv2-only and torch-free**: OpenCV YuNet (``cv2.FaceDetectorYN``) for
faces -- a 232 KB MIT-licensed model bundled in ``assets/`` -- DBNet (PP-OCRv3
differentiable-binarization via ``cv2.dnn.TextDetectionModel_DB``, a 2.4 MB Apache-2.0
model bundled in ``assets/``) for text, and a Canny ``edge_density``. The whole planner
peaks ~100 MB RSS in a few ms, so it adds nothing meaningful to a GPU run and runs
anywhere the pipeline runs.

The text detector falls back to the old MSER region heuristic if the DBNet model can't
load. Either way text only ever ADDS controlnet, so a miss is backstopped by the
edge-density route and a false positive only costs a controlnet run.
"""

# cv2/numpy boundary: cv2 ships no usable element types; relax the unknown-type rules
# for this file only.
# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false
from __future__ import annotations

import logging
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from numpy.typing import NDArray

logger = logging.getLogger(__name__)

# ── Routing thresholds (tunable; quality-priority -> controlnet unless clearly flat) ──
# Canny edge-density below this, AND no face AND no text -> plain SDXL (nothing to
# preserve). The headshot measures ~0.022, a busy photo higher; only a near-flat
# gradient/solid image falls under 0.008.
_STRUCTURELESS_EDGE_MAX = 0.008
# MSER regions per megapixel above this -> likely text. The MSER path is now only the
# FALLBACK when the bundled DBNet model can't load; DBNet (below) is the primary text
# detector. Rough heuristic: a no-text portrait measures a few hundred/MP, dense text
# far more. Set high so it rarely false-fires; text only ever ADDS controlnet.
_TEXT_MSER_PER_MP = 1500.0
_FACE_SCORE = 0.6  # YuNet confidence for a face to count
# Downscale the long side to this for DETECTION only (faces stay detectable down to
# ~10px, and this bounds YuNet/DBNet/MSER cost on huge inputs). Removal runs at full res.
_DETECT_MAX_SIDE = 1024

# DBNet (PP-OCRv3 differentiable-binarization) text-region detector via cv2.dnn -- the
# primary "has meaningful text" signal. The model is the shared PP-OCRv3 detection net
# from OpenCV Zoo (Apache-2.0); en/cn variants are byte-identical, so it is bundled
# language-neutral. cv2.dnn is core OpenCV, so this adds NO new pip dependency.
_DBNET_ASSET = "text_detection_ppocrv3_2023may.onnx"  # Apache-2.0 (OpenCV Zoo PP-OCRv3 DB)
_DBNET_BINARY_THRESHOLD = 0.3
_DBNET_POLYGON_THRESHOLD = 0.5
_DBNET_MAX_CANDIDATES = 200
_DBNET_UNCLIP_RATIO = 2.0
_DBNET_INPUT_SIDE = 736  # square input, multiple of 32 (PP-OCRv3 default)
_DBNET_MEAN = (122.67891434, 116.66876762, 104.00698793)  # ImageNet mean * 255
_dbnet: Any = None  # lazy singleton; set to False after a load failure (-> MSER fallback)

# When a smoothing pass ran (controlnet or face restore), the adaptive polish
# (humanizer.adaptive_polish) restores the input's detail level, sparing text --
# replacing the old fixed unsharp/grain which over-/under-corrected and speckled text.
_UPSCALE_FLOOR = 1024

_YUNET_ASSET = "face_detection_yunet_2023mar.onnx"  # MIT (Shiqi Yu), OpenCV Zoo
_yunet: Any = None  # lazy singleton


@dataclass(frozen=True)
class AutoConfig:
    """Resolved quality modes from content analysis (the ``--auto`` plan)."""

    pipeline: str  # "default" | "controlnet"
    restore_faces: bool
    adaptive_polish: bool  # restore the input's detail level (sharpen + masked grain), sparing text
    unsharp: float  # fixed-polish knobs, 0 in auto (the adaptive polish replaces them)
    humanize: float
    min_resolution: int
    # signals retained for logging / debugging a bad pick
    has_face: bool
    has_text: bool
    edge_density: float
    width: int
    height: int

    @property
    def reason(self) -> str:
        """One-line human-readable summary of the plan (logged per image)."""
        bits = ["face" if self.has_face else "no-face"]
        if self.has_text:
            bits.append("text")
        bits.append(f"edges={self.edge_density:.3f}")
        rf = ", face-restore on" if self.restore_faces else ""
        if self.adaptive_polish:
            polish = ", adaptive polish"
        elif self.unsharp or self.humanize:
            polish = f", unsharp {self.unsharp}/grain {self.humanize}"
        else:
            polish = ""
        return f"{'+'.join(bits)} -> {self.pipeline} pipeline{rf}{polish}"


def _to_bgr(image: NDArray[Any]) -> NDArray[Any]:
    """Normalize a 2D grayscale or 4-channel BGRA array to 3-channel BGR."""
    import cv2

    if image.ndim == 2:
        return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    if image.shape[2] == 4:
        return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
    return image


def _to_gray(image: NDArray[Any]) -> NDArray[Any]:
    """Single-channel grayscale; passes a 2D (already-gray) input through unchanged."""
    import cv2

    if image.ndim == 3 and image.shape[2] >= 3:
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return image


def _downscale_for_detection(image: NDArray[Any]) -> NDArray[Any]:
    """Shrink the long side to ``_DETECT_MAX_SIDE`` for cheap, bounded detection."""
    import cv2

    h, w = image.shape[:2]
    long_side = max(h, w)
    if long_side <= _DETECT_MAX_SIDE:
        return image
    scale = _DETECT_MAX_SIDE / long_side
    return cv2.resize(image, (max(1, round(w * scale)), max(1, round(h * scale))), interpolation=cv2.INTER_AREA)


def detect_face(image: NDArray[Any]) -> bool:
    """True if OpenCV YuNet finds at least one face. cv2-only, torch-free."""
    import cv2

    global _yunet
    img = _to_bgr(image)
    h, w = img.shape[:2]
    if h < 1 or w < 1:
        return False
    try:
        if _yunet is None:
            model = Path(__file__).parent / "assets" / _YUNET_ASSET
            _yunet = cv2.FaceDetectorYN.create(str(model), "", (w, h), _FACE_SCORE, 0.3, 5000)
        _yunet.setInputSize((w, h))
        _, faces = _yunet.detect(img)
    except cv2.error as e:  # malformed input / model
        logger.debug("YuNet face detect failed (%s); assuming no face", e)
        return False
    return faces is not None and len(faces) > 0


def _detect_text_dbnet(image: NDArray[Any]) -> bool | None:
    """DBNet (PP-OCRv3) text-region presence via cv2.dnn.

    Returns True/False on a successful run, or None if the bundled model can't load
    (the caller then falls back to the MSER heuristic). Loads once, lazily.
    """
    import cv2

    global _dbnet
    if _dbnet is False:  # a prior load failed; skip straight to the MSER fallback
        return None
    img = _to_bgr(image)
    h, w = img.shape[:2]
    if h < 1 or w < 1:
        return False
    try:
        if _dbnet is None:
            model = Path(__file__).parent / "assets" / _DBNET_ASSET
            net = cv2.dnn.TextDetectionModel_DB(str(model))
            net.setBinaryThreshold(_DBNET_BINARY_THRESHOLD)
            net.setPolygonThreshold(_DBNET_POLYGON_THRESHOLD)
            net.setMaxCandidates(_DBNET_MAX_CANDIDATES)
            net.setUnclipRatio(_DBNET_UNCLIP_RATIO)
            net.setInputParams(1.0 / 255.0, (_DBNET_INPUT_SIDE, _DBNET_INPUT_SIDE), _DBNET_MEAN)
            _dbnet = net
        boxes, _ = _dbnet.detect(img)
    except Exception as e:  # model load / inference can raise cv2.error or others
        logger.debug("DBNet text detect failed (%s); falling back to MSER", e)
        _dbnet = False
        return None
    return boxes is not None and len(boxes) > 0


def _detect_text_mser(image: NDArray[Any]) -> bool:
    """Fallback MSER-based text-presence heuristic (used only if DBNet can't load)."""
    import cv2

    gray = _to_gray(image)
    h, w = gray.shape[:2]
    try:
        regions, _ = cv2.MSER_create().detectRegions(gray)
    except cv2.error:
        return False
    per_mp = len(regions) / max(1e-6, (h * w) / 1e6)
    return per_mp > _TEXT_MSER_PER_MP


def detect_text(image: NDArray[Any]) -> bool:
    """Text-presence: DBNet (cv2.dnn) when the bundled model loads, else the MSER heuristic."""
    dbnet = _detect_text_dbnet(image)
    return _detect_text_mser(image) if dbnet is None else dbnet


def edge_density(image: NDArray[Any]) -> float:
    """Fraction of Canny edge pixels -- a cheap 'has structure' proxy in [0, 1]."""
    import cv2

    gray = _to_gray(image)
    edges = cv2.Canny(gray, 100, 200)
    return float((edges > 0).mean())


def plan(image_path: Path) -> AutoConfig | None:
    """Inspect the input image and return the quality modes, or None if unreadable.

    Pure analysis: loads the image, runs the cv2 detectors on a downscaled copy, and
    applies the quality-priority routing rules. Safe to call wherever the pipeline
    runs; no diffusion model is loaded.
    """
    from remove_ai_watermarks import image_io

    image = image_io.imread(image_path)
    if image is None:
        return None

    h, w = image.shape[:2]
    small = _downscale_for_detection(image)
    gray = _to_gray(small)  # convert once; edge density + the MSER fallback use gray
    has_face = detect_face(small)  # YuNet needs the 3-channel image
    has_text = detect_text(small)  # DBNet wants BGR; the MSER fallback grays it internally
    edges = edge_density(gray)

    structureless = (not has_face) and (not has_text) and edges < _STRUCTURELESS_EDGE_MAX
    pipeline = "default" if structureless else "controlnet"
    restore_faces = has_face
    smoothing = pipeline == "controlnet" or restore_faces

    cfg = AutoConfig(
        pipeline=pipeline,
        restore_faces=restore_faces,
        adaptive_polish=smoothing,  # adaptive (detail-targeted) polish when a smoothing pass ran
        unsharp=0.0,
        humanize=0.0,
        min_resolution=_UPSCALE_FLOOR,
        has_face=has_face,
        has_text=has_text,
        edge_density=edges,
        width=w,
        height=h,
    )
    logger.debug("auto plan for %s: %s", image_path, cfg.reason)
    return cfg