Files
remove-ai-watermarks/src/remove_ai_watermarks/auto_config.py
T
Victor Kuznetsov b686dbdd79 feat(auto): adaptive detail-targeting polish + --adaptive-polish flag
The fixed mild auto polish (unsharp 0.5 / grain 2.0) under-corrected soft
photo/face output (gemini_3 stayed at lap-var 84 vs its 592 original) and its
grain speckled small text. Replace it with humanizer.adaptive_polish: target the
input's Laplacian variance with a capped unsharp scaled to the deficit + edge-
masked grain (smooth regions only), calibrated by a short sigma search. Self-
limiting on text/graphics -- already high-frequency, so almost no polish lands
and text edges are masked out. Validated on the spaces corpus (gemini_3 84 -> 334
end-to-end; openai_1 text near-untouched).

Interface: every --auto decision is now independently overridable -- add
--adaptive-polish/--no-adaptive-polish (matching --restore-faces; works without
--auto too) so the polish can be disabled or used manually. _apply_auto overrides
exactly the three content-adaptive modes (pipeline, restore-faces, adaptive-
polish); --unsharp/--humanize stay independent fixed filters.

cv2-only, no new deps. Threaded through invisible/all (not batch).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-03 21:49:08 -07:00

218 lines
9.0 KiB
Python

"""Automatic pipeline planning for the ``--auto`` quality mode.
``plan(image_path)`` inspects the INPUT image (before the diffusion model loads)
and returns the quality modes to use, so the pipeline can adapt to content. It is
meant to run as the FIRST step of the invisible/all pipeline, wherever that pipeline
runs (locally, or the raiw.cc Modal GPU worker) -- never on a memory-constrained web
host (image work there OOM-crashes the container).
Routing is **quality-priority**: ControlNet (text/face-structure preservation) is the
default; it is only skipped for a clearly structure-less image (no face, no text,
near-zero edges), where plain SDXL is cheaper and just as good. GFPGAN face
restoration is enabled when a face is present. When a smoothing pass (controlnet or
face restore) ran, the **adaptive polish** (``humanizer.adaptive_polish``) restores
the input's detail level -- a capped unsharp + edge-masked grain targeting the input's
Laplacian variance -- to counter the over-smoothed "AI look". It is self-limiting on
text/graphics (already high-frequency, so almost no polish) and spares text/edges by
masking the grain.
Detection is **cv2-only and torch-free**: OpenCV YuNet (``cv2.FaceDetectorYN``) for
faces -- a 232 KB MIT-licensed model bundled in ``assets/`` -- plus a Canny
edge-density + MSER region heuristic for text/structure. The whole planner peaks
~100 MB RSS in a few ms, so it adds nothing meaningful to a GPU run and runs anywhere
the pipeline runs.
The text heuristic is a deliberately rough Phase-1 placeholder (DBNet via cv2.dnn is
the planned precision upgrade); it only ever ADDS controlnet, so a miss is backstopped
by the edge-density route and a false positive only costs a controlnet run.
"""
# cv2/numpy boundary: cv2 ships no usable element types; relax the unknown-type rules
# for this file only.
# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from numpy.typing import NDArray
logger = logging.getLogger(__name__)
# ── Routing thresholds (tunable; quality-priority -> controlnet unless clearly flat) ──
# Canny edge-density below this, AND no face AND no text -> plain SDXL (nothing to
# preserve). The headshot measures ~0.022, a busy photo higher; only a near-flat
# gradient/solid image falls under 0.008.
_STRUCTURELESS_EDGE_MAX = 0.008
# MSER regions per megapixel above this -> likely text. Rough Phase-1 heuristic: a
# no-text portrait measures a few hundred/MP, dense text far more. Set high so it
# rarely false-fires; it only ever ADDS controlnet so miscalibration is low-harm.
_TEXT_MSER_PER_MP = 1500.0
# Score threshold handed to the YuNet detector when it is created in detect_face():
# the confidence a detection needs for a face to count.
_FACE_SCORE = 0.6
# Downscale the long side to this for DETECTION only (faces stay detectable down to
# ~10px, and this bounds YuNet/MSER cost on huge inputs). Removal runs at full res.
_DETECT_MAX_SIDE = 1024

# ── Polish (no tunables in this module) ──
# When a smoothing pass ran (controlnet or face restore), the adaptive polish
# (humanizer.adaptive_polish) restores the input's detail level, sparing text --
# replacing the old fixed unsharp/grain which over-/under-corrected and speckled text.
# plan() therefore only sets the adaptive_polish flag and leaves unsharp/humanize at 0.

# Value plan() reports as AutoConfig.min_resolution.
# NOTE(review): presumably the minimum side length the pipeline upscales to -- the
# consumer is not in this file, confirm there.
_UPSCALE_FLOOR = 1024
# File name of the bundled YuNet face-detection model, resolved against the
# ``assets/`` directory next to this module.
_YUNET_ASSET = "face_detection_yunet_2023mar.onnx"  # MIT (Shiqi Yu), OpenCV Zoo
# Lazy singleton: the detector is created on the first detect_face() call and then
# reused (only its input size is updated per image).
_yunet: Any = None
@dataclass(frozen=True)
class AutoConfig:
    """Immutable outcome of the ``--auto`` content analysis.

    The first six fields are the quality modes the pipeline should run with; the
    remaining fields are the measured signals, kept only so a questionable pick can
    be explained from the logs.
    """

    # -- resolved quality modes --
    pipeline: str  # "default" | "controlnet"
    restore_faces: bool
    adaptive_polish: bool  # restore the input's detail level (sharpen + masked grain), sparing text
    unsharp: float  # fixed-polish knobs, 0 in auto (the adaptive polish replaces them)
    humanize: float
    min_resolution: int
    # -- signals retained for logging / debugging a bad pick --
    has_face: bool
    has_text: bool
    edge_density: float
    width: int
    height: int

    @property
    def reason(self) -> str:
        """Return a one-line, human-readable summary of the plan (logged per image).

        Format: ``<signals joined by '+'> -> <pipeline> pipeline[, face-restore on][, polish]``.
        """
        # Empty entries are dropped, so "text" only shows up when text was detected.
        signal_parts = (
            "face" if self.has_face else "no-face",
            "text" if self.has_text else "",
            f"edges={self.edge_density:.3f}",
        )
        signals = "+".join(part for part in signal_parts if part)

        face_note = ", face-restore on" if self.restore_faces else ""

        # The adaptive polish takes precedence over the fixed unsharp/grain knobs.
        if self.adaptive_polish:
            polish_note = ", adaptive polish"
        else:
            fixed_polish = self.unsharp or self.humanize
            polish_note = f", unsharp {self.unsharp}/grain {self.humanize}" if fixed_polish else ""

        return f"{signals} -> {self.pipeline} pipeline" + face_note + polish_note
def _to_bgr(image: NDArray[Any]) -> NDArray[Any]:
    """Return ``image`` as 3-channel BGR.

    A 2D array is expanded with ``COLOR_GRAY2BGR``; a 3D array whose last axis has
    4 entries is converted with ``COLOR_BGRA2BGR``. Anything else is returned as
    the same object, unconverted.
    """
    import cv2

    # Pick the conversion first, then convert once.
    if image.ndim == 2:
        conversion = cv2.COLOR_GRAY2BGR
    elif image.shape[2] == 4:
        conversion = cv2.COLOR_BGRA2BGR
    else:
        return image
    return cv2.cvtColor(image, conversion)
def _to_gray(image: NDArray[Any]) -> NDArray[Any]:
    """Return a single-channel grayscale version of ``image``.

    Only a 3D array with at least 3 channels is converted (``COLOR_BGR2GRAY``);
    every other input -- notably a 2D, already-gray array -- is returned unchanged.
    """
    import cv2

    # Guard clause: nothing to do unless there are colour channels to collapse.
    if image.ndim != 3 or image.shape[2] < 3:
        return image
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
def _downscale_for_detection(image: NDArray[Any]) -> NDArray[Any]:
    """Cap the image's long side at ``_DETECT_MAX_SIDE`` for cheap, bounded detection.

    An image that already fits is returned as-is (same object). A larger one is
    resized with ``INTER_AREA``, both sides scaled by the same factor and each
    clamped to at least 1 pixel.
    """
    import cv2

    height, width = image.shape[:2]
    longest = max(height, width)
    if longest > _DETECT_MAX_SIDE:
        ratio = _DETECT_MAX_SIDE / longest
        # cv2.resize takes the target size as (width, height).
        target_size = (max(1, round(width * ratio)), max(1, round(height * ratio)))
        return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)
    return image
def detect_face(image: NDArray[Any]) -> bool:
    """Report whether OpenCV YuNet finds at least one face. cv2-only, torch-free.

    The input is first normalized with ``_to_bgr``. The detector is built once, on
    first use, from the model file in ``assets/`` next to this module and cached in
    the module-level ``_yunet``; later calls only update its input size.

    Returns False for an image with a zero-sized side, when the detector raises
    ``cv2.error`` (logged at debug level), or when no face is reported.
    """
    import cv2

    global _yunet

    bgr = _to_bgr(image)
    height, width = bgr.shape[:2]
    if min(height, width) < 1:
        return False

    input_size = (width, height)  # YuNet takes (width, height)
    try:
        if _yunet is None:
            model_path = Path(__file__).parent / "assets" / _YUNET_ASSET
            # Arguments after the input size are, per the OpenCV FaceDetectorYN.create
            # signature, the score threshold, the NMS threshold and top-k.
            _yunet = cv2.FaceDetectorYN.create(str(model_path), "", input_size, _FACE_SCORE, 0.3, 5000)
        # The cached detector may have been sized for a different image.
        _yunet.setInputSize(input_size)
        _, detections = _yunet.detect(bgr)
    except cv2.error as err:  # malformed input / model
        logger.debug("YuNet face detect failed (%s); assuming no face", err)
        return False

    if detections is None:
        return False
    return len(detections) > 0
def detect_text(image: NDArray[Any]) -> bool:
    """Rough MSER-based text-presence heuristic (Phase-1 placeholder for DBNet).

    Counts the MSER regions found in the grayscale image and normalizes the count
    by the image area in megapixels.

    Returns:
        True when the regions-per-megapixel figure exceeds ``_TEXT_MSER_PER_MP``.
        False otherwise, and also when MSER raises ``cv2.error`` -- that failure is
        logged at debug level (matching ``detect_face``) so a silently dropped text
        signal can be diagnosed from the logs.
    """
    import cv2

    gray = _to_gray(image)
    h, w = gray.shape[:2]
    try:
        regions, _ = cv2.MSER_create().detectRegions(gray)
    except cv2.error as e:  # malformed / unsupported input
        logger.debug("MSER text detect failed (%s); assuming no text", e)
        return False
    # Clamp the area so a degenerate (zero-area) image cannot divide by zero.
    megapixels = max(1e-6, (h * w) / 1e6)
    return len(regions) / megapixels > _TEXT_MSER_PER_MP
def edge_density(image: NDArray[Any]) -> float:
    """Return the fraction of pixels Canny marks as edges -- a cheap 'has structure' proxy.

    The image is converted with ``_to_gray`` and run through ``cv2.Canny`` with
    thresholds 100/200; the result is the mean of the non-zero mask, i.e. a value
    in [0, 1].

    NOTE(review): unlike ``detect_face`` / ``detect_text``, nothing here catches
    ``cv2.error``, so a Canny failure propagates to the caller -- confirm intended.
    """
    import cv2

    edge_map = cv2.Canny(_to_gray(image), 100, 200)
    is_edge = edge_map > 0
    return float(is_edge.mean())
def plan(image_path: Path) -> AutoConfig | None:
    """Analyse the input image and choose the quality modes for it.

    Pure analysis: the image is loaded through ``image_io.imread``, the cv2
    detectors run on a downscaled copy, and the quality-priority routing rules are
    applied. No diffusion model is loaded, so this is safe to call wherever the
    pipeline runs.

    Returns:
        The resolved ``AutoConfig`` (``width``/``height`` are those of the
        full-size image), or None when ``image_io.imread`` returns None.
    """
    from remove_ai_watermarks import image_io

    image = image_io.imread(image_path)
    if image is None:
        return None
    full_height, full_width = image.shape[:2]

    # Detection works on a bounded-size copy; gray is derived once because the
    # text/edge detectors pass an already-gray input straight through.
    detection_image = _downscale_for_detection(image)
    detection_gray = _to_gray(detection_image)

    face_found = detect_face(detection_image)  # YuNet needs the 3-channel image
    text_found = detect_text(detection_gray)
    density = edge_density(detection_gray)

    # ControlNet unless the image is clearly structure-less (no face, no text,
    # near-zero edges). Written as ``not (density < ...)`` so a NaN density still
    # counts as "has structure".
    has_structure = face_found or text_found or not (density < _STRUCTURELESS_EDGE_MAX)
    chosen_pipeline = "controlnet" if has_structure else "default"

    # Face restoration is tied directly to face presence.
    restore = face_found
    # Either pass smooths the output, which is what the adaptive polish counters.
    smoothing_ran = chosen_pipeline == "controlnet" or restore

    cfg = AutoConfig(
        pipeline=chosen_pipeline,
        restore_faces=restore,
        adaptive_polish=smoothing_ran,
        unsharp=0.0,  # fixed-polish knobs stay off in auto mode
        humanize=0.0,
        min_resolution=_UPSCALE_FLOOR,
        has_face=face_found,
        has_text=text_found,
        edge_density=density,
        width=full_width,
        height=full_height,
    )
    logger.debug("auto plan for %s: %s", image_path, cfg.reason)
    return cfg