feat(auto): content-adaptive --auto quality mode, Phase 1

Add `auto_config.plan(image_path) -> AutoConfig`, the first step of the invisible/all pipeline: it inspects the input image (before the diffusion model loads) and picks the quality modes so the run adapts to content. Quality-priority routing -- ControlNet (text/face-structure preservation) is the default, skipped for plain SDXL only on a clearly structure-less image; GFPGAN face restore when a face is present; a mild sharpen + grain polish when a smoothing pass ran. Exposed as `--auto` on `all`/`invisible` (`_apply_auto`; explicit flags override via click's parameter source). Not wired into batch (its engine is cached per-mode). Detection is cv2-only and torch-free (~100 MB peak RSS, a few ms): OpenCV YuNet (`cv2.FaceDetectorYN`, MIT, 232 KB model bundled in assets/) for faces, a Canny edge-density + MSER heuristic for text/structure (a rough Phase-1 placeholder; DBNet via cv2.dnn is the planned upgrade). ZERO new pip deps. Designed to run wherever the pipeline runs -- the raiw.cc Modal GPU worker -- never on the 512 MB web host. Real-ESRGAN-via-Spandrel upscaling (a new `esrgan` extra) and an adaptive Laplacian-variance polish are deferred to later phases. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-10 12:53:56 +02:00 · 2026-06-03 20:52:17 -07:00
parent ea59bdc3e2
commit 9bd2c17cc4
6 changed files with 365 additions and 0 deletions
@@ -0,0 +1,209 @@
+"""Automatic pipeline planning for the ``--auto`` quality mode.
+
+``plan(image_path)`` inspects the INPUT image (before the diffusion model loads)
+and returns the quality modes to use, so the pipeline can adapt to content. It is
+meant to run as the FIRST step of the invisible/all pipeline, wherever that pipeline
+runs (locally, or the raiw.cc Modal GPU worker) -- never on a memory-constrained web
+host (image work there OOM-crashes the container).
+
+Routing is **quality-priority**: ControlNet (text/face-structure preservation) is the
+default; it is only skipped for a clearly structure-less image (no face, no text,
+near-zero edges), where plain SDXL is cheaper and just as good. GFPGAN face
+restoration is enabled when a face is present. A mild sharpen + grain polish is added
+when a smoothing pass (controlnet or face restore) ran, to counter the over-smoothed
+"AI look".
+
+Detection is **cv2-only and torch-free**: OpenCV YuNet (``cv2.FaceDetectorYN``) for
+faces -- a 232 KB MIT-licensed model bundled in ``assets/`` -- plus a Canny
+edge-density + MSER region heuristic for text/structure. The whole planner peaks at
+~100 MB RSS and takes a few ms, so it adds nothing meaningful to a GPU run and runs
+anywhere the pipeline runs. (Phase 1 applies a fixed mild polish; an adaptive
+Laplacian-variance polish that measures the OUTPUT is a later phase.)
+
+The text heuristic is a deliberately rough Phase-1 placeholder (DBNet via cv2.dnn is
+the planned precision upgrade); it only ever ADDS controlnet, so a miss is backstopped
+by the edge-density route and a false positive only costs a controlnet run.
+"""
+
+# cv2/numpy boundary: cv2 ships no usable element types; relax the unknown-type rules
+# for this file only.
+# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+logger = logging.getLogger(__name__)
+
+# ── Routing thresholds (tunable; quality-priority -> controlnet unless clearly flat) ──
+# Canny edge-density below this, AND no face AND no text -> plain SDXL (nothing to
+# preserve). The headshot measures ~0.022, a busy photo higher; only a near-flat
+# gradient/solid image falls under 0.008.
+_STRUCTURELESS_EDGE_MAX = 0.008
+# MSER regions per megapixel above this -> likely text. Rough Phase-1 heuristic: a
+# no-text portrait measures a few hundred/MP, dense text far more. Set high so it
+# rarely false-fires; it only ever ADDS controlnet so miscalibration is low-harm.
+_TEXT_MSER_PER_MP = 1500.0
+_FACE_SCORE = 0.6  # YuNet confidence for a face to count
+# Downscale the long side to this for DETECTION only (faces stay detectable down to
+# ~10px, and this bounds YuNet/MSER cost on huge inputs). Removal runs at full res.
+_DETECT_MAX_SIDE = 1024
+
+# Auto polish applied only when a smoothing pass ran (controlnet or face restore),
+# to counter the soft "AI look". Conservative defaults; the user can override.
+_AUTO_UNSHARP = 0.5  # unsharp-mask strength reported as AutoConfig.unsharp
+_AUTO_HUMANIZE = 2.0  # grain strength reported as AutoConfig.humanize
+# Not part of the polish pair above: plan() always reports this value as
+# AutoConfig.min_resolution, regardless of what was detected.
+_UPSCALE_FLOOR = 1024
+
+_YUNET_ASSET = "face_detection_yunet_2023mar.onnx"  # MIT (Shiqi Yu), OpenCV Zoo
+_yunet: Any = None  # lazy singleton: created by the first detect_face() call, then reused
+
+
+@dataclass(frozen=True)
+class AutoConfig:
+    """Quality modes chosen by content analysis -- the outcome of the ``--auto`` plan."""
+
+    pipeline: str  # "default" | "controlnet"
+    restore_faces: bool
+    unsharp: float
+    humanize: float
+    min_resolution: int
+    # Detection signals, kept so a questionable pick can be logged and debugged.
+    has_face: bool
+    has_text: bool
+    edge_density: float
+    width: int
+    height: int
+
+    @property
+    def reason(self) -> str:
+        """Describe the plan on one line: detected signals first, then the chosen modes."""
+        # Left-hand side: what was detected, joined with "+".
+        signals = "face" if self.has_face else "no-face"
+        if self.has_text:
+            signals += "+text"
+        signals += f"+edges={self.edge_density:.3f}"
+
+        # Right-hand side: the pipeline, plus optional suffixes only when they apply.
+        summary = f"{signals} -> {self.pipeline} pipeline"
+        if self.restore_faces:
+            summary += ", face-restore on"
+        if self.unsharp or self.humanize:
+            summary += f", unsharp {self.unsharp}/grain {self.humanize}"
+        return summary
+
+
+def _to_bgr(image: NDArray[Any]) -> NDArray[Any]:
+    """Return ``image`` as 3-channel BGR.
+
+    A 2D array is expanded from gray, a 4-channel array is treated as BGRA and loses
+    its alpha channel; any other 3D array is returned as-is.
+    """
+    import cv2
+
+    if image.ndim == 2:
+        conversion = cv2.COLOR_GRAY2BGR
+    elif image.shape[2] == 4:
+        conversion = cv2.COLOR_BGRA2BGR
+    else:
+        return image
+    return cv2.cvtColor(image, conversion)
+
+
+def _to_gray(image: NDArray[Any]) -> NDArray[Any]:
+    """Collapse a 3D array with three or more channels to gray; return anything else as-is."""
+    import cv2
+
+    has_color_channels = image.ndim == 3 and image.shape[2] >= 3
+    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if has_color_channels else image
+
+
+def _downscale_for_detection(image: NDArray[Any]) -> NDArray[Any]:
+    """Return a copy whose long side is ``_DETECT_MAX_SIDE``, or ``image`` itself if it already fits."""
+    import cv2
+
+    height, width = image.shape[:2]
+    longest = max(height, width)
+    if longest > _DETECT_MAX_SIDE:
+        factor = _DETECT_MAX_SIDE / longest
+        # cv2.resize takes (width, height); clamp so the short side never rounds to 0.
+        target = (max(1, round(width * factor)), max(1, round(height * factor)))
+        return cv2.resize(image, target, interpolation=cv2.INTER_AREA)
+    return image
+
+
+def detect_face(image: NDArray[Any]) -> bool:
+    """Report whether OpenCV YuNet detects at least one face in ``image``.
+
+    The detector is built on first use and kept in the module-level ``_yunet``; any
+    ``cv2.error`` during creation or detection is logged at debug level and treated
+    as "no face".
+    """
+    import cv2
+
+    global _yunet
+    bgr = _to_bgr(image)
+    height, width = bgr.shape[:2]
+    if min(height, width) < 1:
+        return False
+
+    input_size = (width, height)
+    try:
+        if _yunet is None:
+            weights = Path(__file__).parent / "assets" / _YUNET_ASSET
+            _yunet = cv2.FaceDetectorYN.create(str(weights), "", input_size, _FACE_SCORE, 0.3, 5000)
+        # The detector is shared across calls, so re-sync its input size every time.
+        _yunet.setInputSize(input_size)
+        _, detections = _yunet.detect(bgr)
+    except cv2.error as e:  # malformed input / model
+        logger.debug("YuNet face detect failed (%s); assuming no face", e)
+        return False
+
+    if detections is None:
+        return False
+    return len(detections) > 0
+
+
+def detect_text(image: NDArray[Any]) -> bool:
+    """Guess whether ``image`` contains text from its MSER region density.
+
+    Returns True when the number of MSER regions per megapixel exceeds
+    ``_TEXT_MSER_PER_MP``; a ``cv2.error`` from MSER counts as "no text".
+    """
+    import cv2
+
+    gray = _to_gray(image)
+    rows, cols = gray.shape[:2]
+    # Floor the area so an empty image cannot divide by zero.
+    megapixels = max(1e-6, (rows * cols) / 1e6)
+    try:
+        regions, _ = cv2.MSER_create().detectRegions(gray)
+    except cv2.error:
+        return False
+    return len(regions) / megapixels > _TEXT_MSER_PER_MP
+
+
+def edge_density(image: NDArray[Any]) -> float:
+    """Return the share of pixels that Canny (thresholds 100/200) marks as edges."""
+    import cv2
+
+    edge_map = cv2.Canny(_to_gray(image), 100, 200)
+    edge_mask = edge_map > 0
+    return float(edge_mask.mean())
+
+
+def plan(image_path: Path) -> AutoConfig | None:
+    """Analyze the input image and choose the quality modes for it.
+
+    Reads the image, runs the face/text/edge detectors on a downscaled copy, and maps
+    the three signals onto a pipeline, face restoration and polish settings. Returns
+    ``None`` when the image cannot be read. No diffusion model is loaded here.
+    """
+    from remove_ai_watermarks import image_io
+
+    image = image_io.imread(image_path)
+    if image is None:
+        return None
+
+    height, width = image.shape[:2]  # reported at full resolution, not detection size
+    detect_img = _downscale_for_detection(image)
+    detect_gray = _to_gray(detect_img)  # shared by the text and edge detectors
+    face_found = detect_face(detect_img)  # YuNet wants the colour image, not the gray one
+    text_found = detect_text(detect_gray)
+    density = edge_density(detect_gray)
+
+    # Quality-priority routing: controlnet unless the image has no face, no text and
+    # an edge density below the structure-less threshold.
+    if face_found or text_found or not (density < _STRUCTURELESS_EDGE_MAX):
+        chosen_pipeline = "controlnet"
+    else:
+        chosen_pipeline = "default"
+
+    # Polish only counters smoothing, so it is on exactly when a smoothing pass runs.
+    needs_polish = chosen_pipeline == "controlnet" or face_found
+    if needs_polish:
+        sharpen, grain = _AUTO_UNSHARP, _AUTO_HUMANIZE
+    else:
+        sharpen, grain = 0.0, 0.0
+
+    cfg = AutoConfig(
+        pipeline=chosen_pipeline,
+        restore_faces=face_found,
+        unsharp=sharpen,
+        humanize=grain,
+        min_resolution=_UPSCALE_FLOOR,
+        has_face=face_found,
+        has_text=text_found,
+        edge_density=density,
+        width=width,
+        height=height,
+    )
+    logger.debug("auto plan for %s: %s", image_path, cfg.reason)
+    return cfg
@@ -159,6 +159,48 @@ _unsharp_option = click.option(
    "--unsharp", type=float, default=0.0, help="Unsharp-mask sharpening strength (0 = off, typical: 0.3-0.8)."
 )

+# Shared ``--auto`` boolean flag (off by default), applied as a decorator to the
+# ``invisible`` and ``all`` commands; when set, those commands call ``_apply_auto``.
+_auto_option = click.option(
+    "--auto",
+    is_flag=True,
+    default=False,
+    help="Auto-pick quality modes (pipeline, face restore, sharpen/grain) from image content. "
+    "Explicit flags override. EXPERIMENTAL.",
+)
+
+
+def _apply_auto(
+    ctx: click.Context,
+    source: Path,
+    pipeline: str,
+    restore_faces: bool,
+    unsharp: float,
+    humanize: float,
+) -> tuple[str, bool, float, float]:
+    """Resolve ``--auto``: plan modes from the image, overriding only the flags the
+    user left at their default (an explicit flag always wins).
+
+    Args:
+        ctx: Click context of the running command; used to tell which parameters
+            still come from their declared default.
+        source: Image to analyze.
+        pipeline, restore_faces, unsharp, humanize: Values parsed from the command line.
+
+    Returns:
+        The resolved ``(pipeline, restore_faces, unsharp, humanize)``. If the image
+        cannot be read, the inputs are returned unchanged.
+
+    The printed "Auto:" line describes the RESOLVED modes (what will actually run),
+    not the planner's raw pick, so it stays truthful when explicit flags override it.
+    """
+    import dataclasses
+
+    from remove_ai_watermarks import auto_config
+
+    cfg = auto_config.plan(source)
+    if cfg is None:
+        console.print("  Auto: could not read image; using defaults")
+        return pipeline, restore_faces, unsharp, humanize
+
+    def _is_default(name: str) -> bool:
+        return ctx.get_parameter_source(name) == click.core.ParameterSource.DEFAULT
+
+    if _is_default("pipeline"):
+        pipeline = cfg.pipeline
+    if _is_default("restore_faces"):
+        restore_faces = cfg.restore_faces
+
+    # The auto polish exists to counter a smoothing pass (controlnet or face restore).
+    # Decide that from the RESOLVED modes: if explicit flags turned both off, the plan's
+    # polish would sharpen/grain an image that was never smoothed, so fall back to 0.
+    # Without overrides this yields exactly the planner's values.
+    smoothing = pipeline == "controlnet" or restore_faces
+    if _is_default("unsharp"):
+        unsharp = cfg.unsharp if smoothing else 0.0
+    if _is_default("humanize"):
+        humanize = cfg.humanize if smoothing else 0.0
+
+    # Report what will run, keeping the detection signals from the original plan.
+    resolved = dataclasses.replace(
+        cfg,
+        pipeline=pipeline,
+        restore_faces=restore_faces,
+        unsharp=unsharp,
+        humanize=humanize,
+    )
+    console.print(f"  Auto: {resolved.reason}")
+    return pipeline, restore_faces, unsharp, humanize
+

 def _restore_faces_options(f: Any) -> Any:
    """Attach the shared GFPGAN face-restoration flags to an invisible-pipeline command."""
@@ -507,6 +549,7 @@ def cmd_erase(
@_restore_faces_options
@_min_resolution_option
@_unsharp_option
+@_auto_option
@click.pass_context
 def cmd_invisible(
    ctx: click.Context,
@@ -525,6 +568,7 @@ def cmd_invisible(
    controlnet_scale: float,
    restore_faces: bool,
    restore_faces_weight: float,
+    auto: bool,
 ) -> None:
    """Remove invisible AI watermarks (SynthID, StableSignature, TreeRing).

@@ -542,6 +586,10 @@ def cmd_invisible(
    from remove_ai_watermarks.invisible_engine import InvisibleEngine

    source = _validate_image(source)
+    if auto:
+        pipeline, restore_faces, unsharp, humanize = _apply_auto(
+            ctx, source, pipeline, restore_faces, unsharp, humanize
+        )
    if output is None:
        output = source.with_stem(source.stem + "_clean")

@@ -758,6 +806,7 @@ def cmd_identify(ctx: click.Context, source: Path, no_visible: bool, as_json: bo
@_restore_faces_options
@_min_resolution_option
@_unsharp_option
+@_auto_option
@click.pass_context
 def cmd_all(
    ctx: click.Context,
@@ -779,6 +828,7 @@ def cmd_all(
    controlnet_scale: float,
    restore_faces: bool,
    restore_faces_weight: float,
+    auto: bool,
 ) -> None:
    """Remove ALL watermarks: visible + invisible + metadata.

@@ -793,6 +843,10 @@ def cmd_all(

    _banner()
    source = _validate_image(source)
+    if auto:
+        pipeline, restore_faces, unsharp, humanize = _apply_auto(
+            ctx, source, pipeline, restore_faces, unsharp, humanize
+        )

    if output is None:
        output = source.with_stem(source.stem + "_clean")