mirror of
https://github.com/wiltodelta/remove-ai-watermarks.git
synced 2026-06-10 12:53:56 +02:00
6d11c11b52
Three content-quality features for the invisible/all/batch pipeline.
DBNet text detector (auto_config): replace the MSER text heuristic with
PP-OCRv3 differentiable-binarization via cv2.dnn.TextDetectionModel_DB,
using a bundled 2.4 MB Apache-2.0 model (en/cn detection nets are
byte-identical, so it ships language-neutral). cv2.dnn is core OpenCV, so
no new pip dep. MSER stays as the fallback when the model can't load.
Validated on real images: matches MSER everywhere and additionally catches
the Doubao CJK mark MSER missed; routing decisions unchanged otherwise.
Real-ESRGAN upscaler (new upscaler.py, esrgan extra): optional
pre-diffusion super-resolution for the min-resolution floor upscale, loaded
via spandrel (MIT, no basicsr) with BSD-3-Clause weights downloaded on
first use. New --upscaler {lanczos,esrgan} on invisible/all/batch; default
stays lanczos and the engine falls back to lanczos when the extra is absent
or the model errors (never breaks removal). It is a manual opt-in knob (the
auto plan never selects it) -- as a generic GAN it sharpens photo/texture
content strongly but can degrade faces (the diffusion pass regenerates
them) and thin text, documented accordingly.
batch --auto: wire the content-adaptive --auto (+ --adaptive-polish) into
cmd_batch. The plan is recomputed per image and the invisible engine is
cached per resolved pipeline (default/controlnet), so a mixed directory
builds at most one engine of each kind. Verified end-to-end: 3 mixed
images routed correctly with only 2 pipeline loads (controlnet reused).
ruff + strict pyright(src/) clean; 558 tests pass.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
272 lines
12 KiB
Python
272 lines
12 KiB
Python
"""Automatic pipeline planning for the ``--auto`` quality mode.
|
|
|
|
``plan(image_path)`` inspects the INPUT image (before the diffusion model loads)
|
|
and returns the quality modes to use, so the pipeline can adapt to content. It is
|
|
meant to run as the FIRST step of the invisible/all pipeline, wherever that pipeline
|
|
runs (locally, or the raiw.cc Modal GPU worker) -- never on a memory-constrained web
|
|
host (image work there OOM-crashes the container).
|
|
|
|
Routing is **quality-priority**: ControlNet (text/face-structure preservation) is the
|
|
default; it is only skipped for a clearly structure-less image (no face, no text,
|
|
near-zero edges), where plain SDXL is cheaper and just as good. GFPGAN face
|
|
restoration is enabled when a face is present. When a smoothing pass (controlnet or
|
|
face restore) ran, the **adaptive polish** (``humanizer.adaptive_polish``) restores
|
|
the input's detail level -- a capped unsharp + edge-masked grain targeting the input's
|
|
Laplacian variance -- to counter the over-smoothed "AI look". It is self-limiting on
|
|
text/graphics (already high-frequency, so almost no polish) and spares text/edges by
|
|
masking the grain.
|
|
|
|
Detection is **cv2-only and torch-free**: OpenCV YuNet (``cv2.FaceDetectorYN``) for
|
|
faces -- a 232 KB MIT-licensed model bundled in ``assets/`` -- DBNet (PP-OCRv3
|
|
differentiable-binarization via ``cv2.dnn.TextDetectionModel_DB``, a 2.4 MB Apache-2.0
|
|
model bundled in ``assets/``) for text, and a Canny ``edge_density``. The whole planner
|
|
peaks ~100 MB RSS in a few ms, so it adds nothing meaningful to a GPU run and runs
|
|
anywhere the pipeline runs.
|
|
|
|
The text detector falls back to the old MSER region heuristic if the DBNet model can't
|
|
load. Either way text only ever ADDS controlnet, so a miss is backstopped by the
|
|
edge-density route and a false positive only costs a controlnet run.
|
|
"""
|
|
|
|
# cv2/numpy boundary: cv2 ships no usable element types; relax the unknown-type rules
|
|
# for this file only.
|
|
# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
if TYPE_CHECKING:
|
|
from numpy.typing import NDArray
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Routing thresholds (tunable; quality-priority -> controlnet unless clearly flat) ──
|
|
# Canny edge-density below this, AND no face AND no text -> plain SDXL (nothing to
|
|
# preserve). The headshot measures ~0.022, a busy photo higher; only a near-flat
|
|
# gradient/solid image falls under 0.008.
|
|
_STRUCTURELESS_EDGE_MAX = 0.008
|
|
# MSER regions per megapixel above this -> likely text. The MSER path is now only the
|
|
# FALLBACK when the bundled DBNet model can't load; DBNet (below) is the primary text
|
|
# detector. Rough heuristic: a no-text portrait measures a few hundred/MP, dense text
|
|
# far more. Set high so it rarely false-fires; text only ever ADDS controlnet.
|
|
_TEXT_MSER_PER_MP = 1500.0
|
|
_FACE_SCORE = 0.6 # YuNet confidence for a face to count
|
|
# Downscale the long side to this for DETECTION only (faces stay detectable down to
|
|
# ~10px, and this bounds YuNet/DBNet/MSER cost on huge inputs). Removal runs at full res.
|
|
_DETECT_MAX_SIDE = 1024
|
|
|
|
# DBNet (PP-OCRv3 differentiable-binarization) text-region detector via cv2.dnn -- the
|
|
# primary "has meaningful text" signal. The model is the shared PP-OCRv3 detection net
|
|
# from OpenCV Zoo (Apache-2.0); en/cn variants are byte-identical, so it is bundled
|
|
# language-neutral. cv2.dnn is core OpenCV, so this adds NO new pip dependency.
|
|
_DBNET_ASSET = "text_detection_ppocrv3_2023may.onnx" # Apache-2.0 (OpenCV Zoo PP-OCRv3 DB)
|
|
_DBNET_BINARY_THRESHOLD = 0.3
|
|
_DBNET_POLYGON_THRESHOLD = 0.5
|
|
_DBNET_MAX_CANDIDATES = 200
|
|
_DBNET_UNCLIP_RATIO = 2.0
|
|
_DBNET_INPUT_SIDE = 736 # square input, multiple of 32 (PP-OCRv3 default)
|
|
_DBNET_MEAN = (122.67891434, 116.66876762, 104.00698793) # ImageNet mean * 255
|
|
_dbnet: Any = None # lazy singleton; set to False after a load failure (-> MSER fallback)
|
|
|
|
# When a smoothing pass ran (controlnet or face restore), the adaptive polish
|
|
# (humanizer.adaptive_polish) restores the input's detail level, sparing text --
|
|
# replacing the old fixed unsharp/grain which over-/under-corrected and speckled text.
|
|
_UPSCALE_FLOOR = 1024
|
|
|
|
_YUNET_ASSET = "face_detection_yunet_2023mar.onnx" # MIT (Shiqi Yu), OpenCV Zoo
|
|
_yunet: Any = None # lazy singleton
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class AutoConfig:
|
|
"""Resolved quality modes from content analysis (the ``--auto`` plan)."""
|
|
|
|
pipeline: str # "default" | "controlnet"
|
|
restore_faces: bool
|
|
adaptive_polish: bool # restore the input's detail level (sharpen + masked grain), sparing text
|
|
unsharp: float # fixed-polish knobs, 0 in auto (the adaptive polish replaces them)
|
|
humanize: float
|
|
min_resolution: int
|
|
# signals retained for logging / debugging a bad pick
|
|
has_face: bool
|
|
has_text: bool
|
|
edge_density: float
|
|
width: int
|
|
height: int
|
|
|
|
@property
|
|
def reason(self) -> str:
|
|
"""One-line human-readable summary of the plan (logged per image)."""
|
|
bits = ["face" if self.has_face else "no-face"]
|
|
if self.has_text:
|
|
bits.append("text")
|
|
bits.append(f"edges={self.edge_density:.3f}")
|
|
rf = ", face-restore on" if self.restore_faces else ""
|
|
if self.adaptive_polish:
|
|
polish = ", adaptive polish"
|
|
elif self.unsharp or self.humanize:
|
|
polish = f", unsharp {self.unsharp}/grain {self.humanize}"
|
|
else:
|
|
polish = ""
|
|
return f"{'+'.join(bits)} -> {self.pipeline} pipeline{rf}{polish}"
|
|
|
|
|
|
def _to_bgr(image: NDArray[Any]) -> NDArray[Any]:
|
|
"""Normalize a 2D grayscale or 4-channel BGRA array to 3-channel BGR."""
|
|
import cv2
|
|
|
|
if image.ndim == 2:
|
|
return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
|
|
if image.shape[2] == 4:
|
|
return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
|
|
return image
|
|
|
|
|
|
def _to_gray(image: NDArray[Any]) -> NDArray[Any]:
|
|
"""Single-channel grayscale; passes a 2D (already-gray) input through unchanged."""
|
|
import cv2
|
|
|
|
if image.ndim == 3 and image.shape[2] >= 3:
|
|
return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
|
return image
|
|
|
|
|
|
def _downscale_for_detection(image: NDArray[Any]) -> NDArray[Any]:
|
|
"""Shrink the long side to ``_DETECT_MAX_SIDE`` for cheap, bounded detection."""
|
|
import cv2
|
|
|
|
h, w = image.shape[:2]
|
|
long_side = max(h, w)
|
|
if long_side <= _DETECT_MAX_SIDE:
|
|
return image
|
|
scale = _DETECT_MAX_SIDE / long_side
|
|
return cv2.resize(image, (max(1, round(w * scale)), max(1, round(h * scale))), interpolation=cv2.INTER_AREA)
|
|
|
|
|
|
def detect_face(image: NDArray[Any]) -> bool:
|
|
"""True if OpenCV YuNet finds at least one face. cv2-only, torch-free."""
|
|
import cv2
|
|
|
|
global _yunet
|
|
img = _to_bgr(image)
|
|
h, w = img.shape[:2]
|
|
if h < 1 or w < 1:
|
|
return False
|
|
try:
|
|
if _yunet is None:
|
|
model = Path(__file__).parent / "assets" / _YUNET_ASSET
|
|
_yunet = cv2.FaceDetectorYN.create(str(model), "", (w, h), _FACE_SCORE, 0.3, 5000)
|
|
_yunet.setInputSize((w, h))
|
|
_, faces = _yunet.detect(img)
|
|
except cv2.error as e: # malformed input / model
|
|
logger.debug("YuNet face detect failed (%s); assuming no face", e)
|
|
return False
|
|
return faces is not None and len(faces) > 0
|
|
|
|
|
|
def _detect_text_dbnet(image: NDArray[Any]) -> bool | None:
|
|
"""DBNet (PP-OCRv3) text-region presence via cv2.dnn.
|
|
|
|
Returns True/False on a successful run, or None if the bundled model can't load
|
|
(the caller then falls back to the MSER heuristic). Loads once, lazily.
|
|
"""
|
|
import cv2
|
|
|
|
global _dbnet
|
|
if _dbnet is False: # a prior load failed; skip straight to the MSER fallback
|
|
return None
|
|
img = _to_bgr(image)
|
|
h, w = img.shape[:2]
|
|
if h < 1 or w < 1:
|
|
return False
|
|
try:
|
|
if _dbnet is None:
|
|
model = Path(__file__).parent / "assets" / _DBNET_ASSET
|
|
net = cv2.dnn.TextDetectionModel_DB(str(model))
|
|
net.setBinaryThreshold(_DBNET_BINARY_THRESHOLD)
|
|
net.setPolygonThreshold(_DBNET_POLYGON_THRESHOLD)
|
|
net.setMaxCandidates(_DBNET_MAX_CANDIDATES)
|
|
net.setUnclipRatio(_DBNET_UNCLIP_RATIO)
|
|
net.setInputParams(1.0 / 255.0, (_DBNET_INPUT_SIDE, _DBNET_INPUT_SIDE), _DBNET_MEAN)
|
|
_dbnet = net
|
|
boxes, _ = _dbnet.detect(img)
|
|
except Exception as e: # model load / inference can raise cv2.error or others
|
|
logger.debug("DBNet text detect failed (%s); falling back to MSER", e)
|
|
_dbnet = False
|
|
return None
|
|
return boxes is not None and len(boxes) > 0
|
|
|
|
|
|
def _detect_text_mser(image: NDArray[Any]) -> bool:
|
|
"""Fallback MSER-based text-presence heuristic (used only if DBNet can't load)."""
|
|
import cv2
|
|
|
|
gray = _to_gray(image)
|
|
h, w = gray.shape[:2]
|
|
try:
|
|
regions, _ = cv2.MSER_create().detectRegions(gray)
|
|
except cv2.error:
|
|
return False
|
|
per_mp = len(regions) / max(1e-6, (h * w) / 1e6)
|
|
return per_mp > _TEXT_MSER_PER_MP
|
|
|
|
|
|
def detect_text(image: NDArray[Any]) -> bool:
|
|
"""Text-presence: DBNet (cv2.dnn) when the bundled model loads, else the MSER heuristic."""
|
|
dbnet = _detect_text_dbnet(image)
|
|
return _detect_text_mser(image) if dbnet is None else dbnet
|
|
|
|
|
|
def edge_density(image: NDArray[Any]) -> float:
|
|
"""Fraction of Canny edge pixels -- a cheap 'has structure' proxy in [0, 1]."""
|
|
import cv2
|
|
|
|
gray = _to_gray(image)
|
|
edges = cv2.Canny(gray, 100, 200)
|
|
return float((edges > 0).mean())
|
|
|
|
|
|
def plan(image_path: Path) -> AutoConfig | None:
|
|
"""Inspect the input image and return the quality modes, or None if unreadable.
|
|
|
|
Pure analysis: loads the image, runs the cv2 detectors on a downscaled copy, and
|
|
applies the quality-priority routing rules. Safe to call wherever the pipeline
|
|
runs; no diffusion model is loaded.
|
|
"""
|
|
from remove_ai_watermarks import image_io
|
|
|
|
image = image_io.imread(image_path)
|
|
if image is None:
|
|
return None
|
|
|
|
h, w = image.shape[:2]
|
|
small = _downscale_for_detection(image)
|
|
gray = _to_gray(small) # convert once; edge density + the MSER fallback use gray
|
|
has_face = detect_face(small) # YuNet needs the 3-channel image
|
|
has_text = detect_text(small) # DBNet wants BGR; the MSER fallback grays it internally
|
|
edges = edge_density(gray)
|
|
|
|
structureless = (not has_face) and (not has_text) and edges < _STRUCTURELESS_EDGE_MAX
|
|
pipeline = "default" if structureless else "controlnet"
|
|
restore_faces = has_face
|
|
smoothing = pipeline == "controlnet" or restore_faces
|
|
|
|
cfg = AutoConfig(
|
|
pipeline=pipeline,
|
|
restore_faces=restore_faces,
|
|
adaptive_polish=smoothing, # adaptive (detail-targeted) polish when a smoothing pass ran
|
|
unsharp=0.0,
|
|
humanize=0.0,
|
|
min_resolution=_UPSCALE_FLOOR,
|
|
has_face=has_face,
|
|
has_text=has_text,
|
|
edge_density=edges,
|
|
width=w,
|
|
height=h,
|
|
)
|
|
logger.debug("auto plan for %s: %s", image_path, cfg.reason)
|
|
return cfg
|