feat(auto): content-adaptive --auto quality mode, Phase 1

Add `auto_config.plan(image_path) -> AutoConfig`, the first step of the
invisible/all pipeline: it inspects the input image (before the diffusion model
loads) and picks the quality modes so the run adapts to content. Quality-priority
routing -- ControlNet (text/face-structure preservation) is the default, skipped for
plain SDXL only on a clearly structure-less image; GFPGAN face restore when a face is
present; a mild sharpen + grain polish when a smoothing pass ran. Exposed as `--auto`
on `all`/`invisible` (`_apply_auto`; explicit flags override via click's parameter
source). Not wired into batch (its engine is cached per-mode).

Detection is cv2-only and torch-free (~100 MB peak RSS, a few ms): OpenCV YuNet
(`cv2.FaceDetectorYN`, MIT, 232 KB model bundled in assets/) for faces, a Canny
edge-density + MSER heuristic for text/structure (a rough Phase-1 placeholder; DBNet
via cv2.dnn is the planned upgrade). ZERO new pip deps. Designed to run wherever the
pipeline runs -- the raiw.cc Modal GPU worker -- never on the 512 MB web host.

Real-ESRGAN-via-Spandrel upscaling (a new `esrgan` extra) and an adaptive
Laplacian-variance polish are deferred to later phases.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Victor Kuznetsov
2026-06-03 20:52:17 -07:00
parent ea59bdc3e2
commit 9bd2c17cc4
6 changed files with 365 additions and 0 deletions
+209
View File
@@ -0,0 +1,209 @@
"""Automatic pipeline planning for the ``--auto`` quality mode.
``plan(image_path)`` inspects the INPUT image (before the diffusion model loads)
and returns the quality modes to use, so the pipeline can adapt to content. It is
meant to run as the FIRST step of the invisible/all pipeline, wherever that pipeline
runs (locally, or the raiw.cc Modal GPU worker) -- never on a memory-constrained web
host (image work there OOM-crashes the container).
Routing is **quality-priority**: ControlNet (text/face-structure preservation) is the
default; it is only skipped for a clearly structure-less image (no face, no text,
near-zero edges), where plain SDXL is cheaper and just as good. GFPGAN face
restoration is enabled when a face is present. A mild sharpen + grain polish is added
when a smoothing pass (controlnet or face restore) ran, to counter the over-smoothed
"AI look".
Detection is **cv2-only and torch-free**: OpenCV YuNet (``cv2.FaceDetectorYN``) for
faces -- a 232 KB MIT-licensed model bundled in ``assets/`` -- plus a Canny
edge-density + MSER region heuristic for text/structure. The whole planner peaks at
~100 MB RSS and takes a few ms, so it adds nothing meaningful to a GPU run and runs
anywhere the pipeline runs. (Phase 1 applies a fixed mild polish; an adaptive
Laplacian-variance polish that measures the OUTPUT is a later phase.)
The text heuristic is a deliberately rough Phase-1 placeholder (DBNet via cv2.dnn is
the planned precision upgrade); it only ever ADDS controlnet, so a miss is backstopped
by the edge-density route and a false positive only costs a controlnet run.
"""
# cv2/numpy boundary: cv2 ships no usable element types; relax the unknown-type rules
# for this file only.
# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from numpy.typing import NDArray
logger = logging.getLogger(__name__)
# ── Routing thresholds (tunable; quality-priority -> controlnet unless clearly flat) ──
# Canny edge-density below this, AND no face AND no text -> plain SDXL (nothing to
# preserve). The headshot measures ~0.022, a busy photo higher; only a near-flat
# gradient/solid image falls under 0.008.
_STRUCTURELESS_EDGE_MAX = 0.008
# MSER regions per megapixel above this -> likely text. Rough Phase-1 heuristic: a
# no-text portrait measures a few hundred/MP, dense text far more. Set high so it
# rarely false-fires; it only ever ADDS controlnet so miscalibration is low-harm.
_TEXT_MSER_PER_MP = 1500.0
_FACE_SCORE = 0.6  # YuNet confidence for a face to count
# Downscale the long side to this for DETECTION only (faces stay detectable down to
# ~10px, and this bounds YuNet/MSER cost on huge inputs). Removal runs at full res.
_DETECT_MAX_SIDE = 1024
# Auto polish applied only when a smoothing pass ran (controlnet or face restore),
# to counter the soft "AI look". Conservative defaults; the user can override.
_AUTO_UNSHARP = 0.5
_AUTO_HUMANIZE = 2.0
# Reported unconditionally as ``AutoConfig.min_resolution`` by ``plan()``.
# NOTE(review): presumably the minimum output side in pixels (an upscale floor) --
# confirm against the pipeline's min-resolution option.
_UPSCALE_FLOOR = 1024
# File name of the bundled YuNet model; ``detect_face`` resolves it under ``assets/``
# next to this module.
_YUNET_ASSET = "face_detection_yunet_2023mar.onnx"  # MIT (Shiqi Yu), OpenCV Zoo
# Cached detector: created on the first ``detect_face`` call and reused afterwards.
_yunet: Any = None  # lazy singleton
@dataclass(frozen=True)
class AutoConfig:
    """Immutable result of content analysis: the quality modes ``--auto`` resolved."""

    pipeline: str  # "default" | "controlnet"
    restore_faces: bool
    unsharp: float
    humanize: float
    min_resolution: int
    # detection signals, kept so a questionable pick can be logged / debugged
    has_face: bool
    has_text: bool
    edge_density: float
    width: int
    height: int

    @property
    def reason(self) -> str:
        """Return a one-line, human-readable summary of the plan (logged per image).

        Shape: ``<signals joined by '+'> -> <pipeline> pipeline`` followed by
        optional face-restore and polish suffixes.
        """
        signals = ["face"] if self.has_face else ["no-face"]
        if self.has_text:
            signals.append("text")
        signals.append(f"edges={self.edge_density:.3f}")

        summary = f"{'+'.join(signals)} -> {self.pipeline} pipeline"
        if self.restore_faces:
            summary += ", face-restore on"
        # The polish suffix appears as soon as either strength is non-zero.
        if self.unsharp or self.humanize:
            summary += f", unsharp {self.unsharp}/grain {self.humanize}"
        return summary
def _to_bgr(image: NDArray[Any]) -> NDArray[Any]:
    """Return *image* as 3-channel BGR.

    A 2D (grayscale) array is expanded to BGR and a 4-channel (BGRA) array has its
    alpha dropped; any other array is returned unchanged.
    """
    import cv2

    if image.ndim == 2:
        conversion = cv2.COLOR_GRAY2BGR
    elif image.shape[2] == 4:
        conversion = cv2.COLOR_BGRA2BGR
    else:
        # Neither gray nor BGRA: hand the array back untouched.
        return image
    return cv2.cvtColor(image, conversion)
def _to_gray(image: NDArray[Any]) -> NDArray[Any]:
    """Return a grayscale version of *image*.

    Only a 3D array with at least three channels is converted; anything else
    (notably an already-2D gray input) is passed through unchanged.
    """
    import cv2

    has_color_channels = image.ndim == 3 and image.shape[2] >= 3
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if has_color_channels else image
def _downscale_for_detection(image: NDArray[Any]) -> NDArray[Any]:
    """Bound detection cost by shrinking the long side to ``_DETECT_MAX_SIDE``.

    Images whose long side already fits are returned as-is (never upscaled).
    """
    import cv2

    height, width = image.shape[:2]
    longest = max(height, width)
    if longest > _DETECT_MAX_SIDE:
        factor = _DETECT_MAX_SIDE / longest
        # cv2.resize takes (width, height); clamp to 1px so extreme aspect ratios
        # never round a side down to zero.
        target = (max(1, round(width * factor)), max(1, round(height * factor)))
        return cv2.resize(image, target, interpolation=cv2.INTER_AREA)
    return image
def detect_face(image: NDArray[Any]) -> bool:
    """Report whether OpenCV YuNet finds at least one face. cv2-only, torch-free.

    The detector is built once from the bundled model and cached in the module-level
    ``_yunet``; its input size is reset on every call. Any ``cv2.error`` raised while
    creating or running the detector is logged at debug level and treated as
    "no face".
    """
    import cv2

    global _yunet
    bgr = _to_bgr(image)
    height, width = bgr.shape[:2]
    if min(height, width) < 1:
        return False
    size = (width, height)
    try:
        if _yunet is None:
            weights = Path(__file__).parent / "assets" / _YUNET_ASSET
            _yunet = cv2.FaceDetectorYN.create(str(weights), "", size, _FACE_SCORE, 0.3, 5000)
        _yunet.setInputSize(size)
        _, detections = _yunet.detect(bgr)
    except cv2.error as e:  # malformed input / model
        logger.debug("YuNet face detect failed (%s); assuming no face", e)
        return False
    if detections is None:
        return False
    return len(detections) > 0
def detect_text(image: NDArray[Any]) -> bool:
    """Rough MSER-based text-presence heuristic (Phase-1 placeholder for DBNet).

    Counts the MSER regions found on the grayscale image and normalizes the count by
    image area; more than ``_TEXT_MSER_PER_MP`` regions per megapixel is treated as
    "likely text".

    Returns:
        True when the region density exceeds the threshold; False otherwise, including
        when OpenCV rejects the input (the failure is logged at debug level, in the
        same way ``detect_face`` reports its failures, rather than vanishing silently).
    """
    import cv2

    gray = _to_gray(image)
    h, w = gray.shape[:2]
    try:
        regions, _ = cv2.MSER_create().detectRegions(gray)
    except cv2.error as e:  # malformed / unsupported input
        logger.debug("MSER text detect failed (%s); assuming no text", e)
        return False
    # Floor the divisor so a degenerate (zero-area) image cannot divide by zero.
    megapixels = max(1e-6, (h * w) / 1e6)
    return len(regions) / megapixels > _TEXT_MSER_PER_MP
def edge_density(image: NDArray[Any]) -> float:
    """Return the share of pixels Canny marks as edges (a cheap structure proxy).

    The image is reduced to grayscale, run through Canny with thresholds 100/200,
    and the mean of the resulting non-zero mask is returned as a Python float.
    """
    import cv2

    edge_map = cv2.Canny(_to_gray(image), 100, 200)
    is_edge = edge_map > 0
    return float(is_edge.mean())
def plan(image_path: Path) -> AutoConfig | None:
    """Analyze the input image and choose the quality modes for ``--auto``.

    Loads the image, runs the face / text / edge detectors on a downscaled copy, and
    applies the quality-priority routing: ControlNet unless the image shows no face,
    no text and an edge density below ``_STRUCTURELESS_EDGE_MAX``. Face restoration
    follows face presence, and the fixed polish values are set whenever ControlNet or
    face restoration is selected.

    Returns:
        The resolved ``AutoConfig`` (width/height are those of the full-size image),
        or ``None`` when the image could not be read.
    """
    from remove_ai_watermarks import image_io

    image = image_io.imread(image_path)
    if image is None:
        return None

    full_h, full_w = image.shape[:2]
    preview = _downscale_for_detection(image)
    # Convert once; the text/edge detectors pass an already-gray input through.
    preview_gray = _to_gray(preview)

    face_found = detect_face(preview)  # YuNet needs the 3-channel image
    text_found = detect_text(preview_gray)
    density = edge_density(preview_gray)

    # Anything with a face, text, or a non-trivial edge density counts as structured.
    has_structure = face_found or text_found or not (density < _STRUCTURELESS_EDGE_MAX)
    chosen_pipeline = "controlnet" if has_structure else "default"

    # Polish only compensates for a smoothing pass (controlnet or face restore).
    smoothed = face_found or chosen_pipeline == "controlnet"
    if smoothed:
        polish_unsharp, polish_grain = _AUTO_UNSHARP, _AUTO_HUMANIZE
    else:
        polish_unsharp, polish_grain = 0.0, 0.0

    result = AutoConfig(
        pipeline=chosen_pipeline,
        restore_faces=face_found,
        unsharp=polish_unsharp,
        humanize=polish_grain,
        min_resolution=_UPSCALE_FLOOR,
        has_face=face_found,
        has_text=text_found,
        edge_density=density,
        width=full_w,
        height=full_h,
    )
    logger.debug("auto plan for %s: %s", image_path, result.reason)
    return result
+54
View File
@@ -159,6 +159,48 @@ _unsharp_option = click.option(
"--unsharp", type=float, default=0.0, help="Unsharp-mask sharpening strength (0 = off, typical: 0.3-0.8)."
)
# Shared ``--auto`` boolean flag (off by default). It is attached to commands as the
# ``@_auto_option`` decorator and arrives in the command as the ``auto`` argument.
_auto_option = click.option(
    "--auto",
    is_flag=True,
    default=False,
    help="Auto-pick quality modes (pipeline, face restore, sharpen/grain) from image content. "
    "Explicit flags override. EXPERIMENTAL.",
)
def _apply_auto(
    ctx: click.Context,
    source: Path,
    pipeline: str,
    restore_faces: bool,
    unsharp: float,
    humanize: float,
) -> tuple[str, bool, float, float]:
    """Resolve ``--auto`` against the flags the user actually passed.

    Plans the quality modes from ``source`` and substitutes the planned value for each
    option whose click parameter source is ``DEFAULT``; an option set any other way
    keeps the caller's value. Prints the plan summary, or a fallback notice when the
    image cannot be read (in which case the inputs are returned untouched).

    Returns:
        The resolved ``(pipeline, restore_faces, unsharp, humanize)`` tuple.
    """
    from remove_ai_watermarks import auto_config

    planned = auto_config.plan(source)
    if planned is None:
        console.print(" Auto: could not read image; using defaults")
        return pipeline, restore_faces, unsharp, humanize

    default_source = click.core.ParameterSource.DEFAULT

    def _left_at_default(param: str) -> bool:
        return ctx.get_parameter_source(param) == default_source

    # Evaluated in the same order as the parameters: pipeline, faces, unsharp, grain.
    resolved = (
        planned.pipeline if _left_at_default("pipeline") else pipeline,
        planned.restore_faces if _left_at_default("restore_faces") else restore_faces,
        planned.unsharp if _left_at_default("unsharp") else unsharp,
        planned.humanize if _left_at_default("humanize") else humanize,
    )
    console.print(f" Auto: {planned.reason}")
    return resolved
def _restore_faces_options(f: Any) -> Any:
"""Attach the shared GFPGAN face-restoration flags to an invisible-pipeline command."""
@@ -507,6 +549,7 @@ def cmd_erase(
@_restore_faces_options
@_min_resolution_option
@_unsharp_option
@_auto_option
@click.pass_context
def cmd_invisible(
ctx: click.Context,
@@ -525,6 +568,7 @@ def cmd_invisible(
controlnet_scale: float,
restore_faces: bool,
restore_faces_weight: float,
auto: bool,
) -> None:
"""Remove invisible AI watermarks (SynthID, StableSignature, TreeRing).
@@ -542,6 +586,10 @@ def cmd_invisible(
from remove_ai_watermarks.invisible_engine import InvisibleEngine
source = _validate_image(source)
if auto:
pipeline, restore_faces, unsharp, humanize = _apply_auto(
ctx, source, pipeline, restore_faces, unsharp, humanize
)
if output is None:
output = source.with_stem(source.stem + "_clean")
@@ -758,6 +806,7 @@ def cmd_identify(ctx: click.Context, source: Path, no_visible: bool, as_json: bo
@_restore_faces_options
@_min_resolution_option
@_unsharp_option
@_auto_option
@click.pass_context
def cmd_all(
ctx: click.Context,
@@ -779,6 +828,7 @@ def cmd_all(
controlnet_scale: float,
restore_faces: bool,
restore_faces_weight: float,
auto: bool,
) -> None:
"""Remove ALL watermarks: visible + invisible + metadata.
@@ -793,6 +843,10 @@ def cmd_all(
_banner()
source = _validate_image(source)
if auto:
pipeline, restore_faces, unsharp, humanize = _apply_auto(
ctx, source, pipeline, restore_faces, unsharp, humanize
)
if output is None:
output = source.with_stem(source.stem + "_clean")