From a0bf62e60161b4461dc9a0a46cc45ac43c875221 Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Thu, 28 May 2026 11:59:15 -0700 Subject: [PATCH] feat(invisible): preserve text/CJK via Differential Diffusion (--protect-text) (v0.6.10) SDXL img2img regenerates every pixel, so small text and CJK glyphs deform at the strengths that defeat SynthID (issue #21). With --protect-text a CJK-native PP-OCRv3 detector (2.4 MB ONNX, cv2.dnn, no torch, cached on first use) locates text regions and the pass switches to the SDXL Differential-Diffusion community pipeline: a per-pixel change map keeps text regions largely intact while the background is regenerated to strip the watermark. Gated to the SDXL default model; falls back to plain img2img with a warning when unavailable. Co-Authored-By: Claude Opus 4.7 --- CLAUDE.md | 1 + README.md | 3 + pyproject.toml | 2 +- src/remove_ai_watermarks/cli.py | 16 ++ src/remove_ai_watermarks/invisible_engine.py | 4 + .../noai/img2img_runner.py | 90 ++++++++++ .../noai/watermark_remover.py | 114 ++++++++++++ src/remove_ai_watermarks/text_protector.py | 167 ++++++++++++++++++ tests/test_img2img_runner.py | 80 ++++++++- tests/test_text_protector.py | 59 +++++++ uv.lock | 2 +- 11 files changed, 535 insertions(+), 3 deletions(-) create mode 100644 src/remove_ai_watermarks/text_protector.py create mode 100644 tests/test_text_protector.py diff --git a/CLAUDE.md b/CLAUDE.md index f4f7f87..5936965 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -36,6 +36,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `region_eraser.py` — universal region eraser (`erase` CLI). `erase(image, boxes=|mask=, backend=)`: `boxes_to_mask` → `cv2.inpaint` (`cv2` backend, default, no deps) or big-LaMa via onnxruntime (`lama` backend, extra `lama`, `Carve/LaMa-ONNX` Apache-2.0 model downloaded on first use, never bundled). 
`erase_lama` crops a padded region around the mask, runs LaMa at its fixed 512² input, pastes only masked pixels back (untouched areas stay pixel-exact). Lazy `_get_lama_session` singleton; `lama_available()` guards the optional import. **LaMa-ONNX costs ~3.5-4 GB peak RAM and ~5-6 s/call on CPU** (FFC working set, not arena — `enable_cpu_mem_arena=False` does not help), so it does NOT fit a minimal droplet; the cv2 backend (tens of MB, ~30 ms) does. LaMa quality at low RAM = serverless/GPU, mirroring how raiw.cc offloads SDXL to fal. - `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. The `detect` extra pulls **torch** transitively (invisible-watermark declares torch a hard dep, and `WatermarkDecoder` eagerly imports `rivaGan` -> `torch` at import time), so detection needs torch present even though dwtDct runs CPU-only on cv2/numpy/pywavelets — no GPU and no separate `gpu` extra required. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. - `trustmark_detector.py` — `detect_trustmark(path)` decodes the OPEN, keyless **Adobe TrustMark** watermark (the soft binding behind Adobe Durable Content Credentials, `alg` `com.adobe.trustmark.P`) via the optional `trustmark` package (extra `trustmark`; pulls torch, downloads model weights on first use). Mirrors `invisible_watermark.py` (lazy singleton, top-of-module pyright pragma, returns None when absent). 
It detects *provenance*, not AI origin as such (TrustMark also marks human-authored content), so `identify` lists it as a watermark without setting `is_ai_generated`. Other soft-binding vendors (Digimarc/Imatag/Steg.AI/...) have no public decoder — they are only *named* via the `C2PA_SOFT_BINDINGS` scan, not decoded. +- `text_protector.py` — text-region protection for the `invisible` SDXL img2img pass (issue #21: CJK/small text deforms at watermark-removal strengths). `is_available()` gates on `cv2.dnn.TextDetectionModel_DB`; `TextProtector.detect_text_boxes(bgr)` runs the **PP-OCRv3 CN** ONNX detector (~2.4 MB, Apache-2.0, opencv_zoo, CJK-native, returns rotated quad polygons) — downloaded+cached to `~/.cache/remove-ai-watermarks` on first use via atomic temp-rename, never bundled, **no torch (cv2.dnn only)**. `build_change_map(boxes, h, w, preserve=0.9, feather=15)` paints a Differential-Diffusion change map. **Polarity (verified empirically):** white(1.0)=PRESERVE original pixels, black(0.0)=MAX change; map is black bg + `preserve` inside text polygons, Gaussian-feathered edges, clipped to [0,1]. `preserve` stays below a hard 1.0 freeze by default so text still scrubs lightly (SynthID survives cropping). Wired into `watermark_remover._run_differential` via the community `pipeline_stable_diffusion_xl_differential_img2img` (loaded with `custom_revision="0.38.0"` — HF resolves the **PyPI** version string, not the `v0.38.0` git tag); gated to the SDXL `DEFAULT_MODEL_ID` only (`_can_protect_text`), falls back to plain img2img with a warning otherwise. The diff pipeline upcasts the VAE to fp32 internally, so do **not** add `upcast_vae()`/`enable_attention_slicing` (both produced NaN/black on fp16 MPS). `build_change_map` is unit-tested without any model download (`tests/test_text_protector.py`). - `face_protector.py` — YOLO detect + soft-blend pattern; mirror this for any "protect region during diffusion" features - `image_io.py` — Unicode-safe cv2 IO (issue #17). 
`imread(path, flags=None)` / `imwrite(path, img)` wrap `np.fromfile`+`cv2.imdecode` / `cv2.imencode`+`tofile` so non-ASCII paths work on Windows -- bare `cv2.imread`/`cv2.imwrite` use the platform ANSI code-page API there and fail (empty decode + `can't open/read file`) on Chinese/Cyrillic/accented filenames. `imread` keeps `cv2.imread` semantics (defaults to `IMREAD_COLOR`, returns `None` on missing/empty/undecodable). **Every cv2 file read/write in the package routes through here; do not call `cv2.imread`/`cv2.imwrite` directly.** macOS/Linux already accept UTF-8 paths, so it is behavior-neutral there (the bug only reproduces on Windows). cv2/numpy are imported lazily inside the functions, so the module is cheap to import in a bare env. diff --git a/README.md b/README.md index 5213e10..9a64f7e 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,8 @@ SDXL is the default since May 2026: empirically defeats SynthID v2 on Gemini 3 P **Analog Humanizer**: optional film grain and chromatic aberration injection that mimics a photo of a screen, raising the bar for AI-generated image classifiers. (It frustrates generic classifiers but does not guarantee forensic invisibility — see the [arXiv:2605.09203](https://arxiv.org/abs/2605.09203) note above.) +**Text Protection** (`--protect-text`): SDXL img2img regenerates every pixel, so small text and CJK glyphs get deformed at the strengths that defeat SynthID. With this flag a CJK-native PP-OCRv3 text detector (a 2.4 MB ONNX model run on CPU via OpenCV's DNN module, downloaded and cached on first use) locates text regions and the pass switches to Differential Diffusion: a per-pixel change map keeps the text regions largely intact while the background is regenerated normally, so glyphs survive the removal pass. SDXL default pipeline only. 
+ ### Stripping C2PA, EXIF, and "Made with AI" metadata AI tools embed generation metadata that social platforms use to show "Made with AI" labels: @@ -247,6 +249,7 @@ remove-ai-watermarks erase image.png --region 1640,1930,400,100 -o clean.png remove-ai-watermarks invisible image.png -o clean.png --humanize 4.0 # Runs at native resolution by default. On a very large image that OOMs the # GPU/MPS, cap the long side: --max-resolution 2048 +# Preserve text / CJK glyphs during regeneration: --protect-text # Check / strip AI metadata (C2PA, EXIF, "Made with AI" labels) # --check also flags SynthID-bearing sources: a C2PA manifest signed by diff --git a/pyproject.toml b/pyproject.toml index b87e0b7..df29d65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "remove-ai-watermarks" -version = "0.6.9" +version = "0.6.10" description = "Remove visible and invisible AI watermarks from images (Gemini / Nano Banana, ChatGPT, Stable Diffusion)" readme = "README.md" requires-python = ">=3.10" diff --git a/src/remove_ai_watermarks/cli.py b/src/remove_ai_watermarks/cli.py index ce90936..9809824 100644 --- a/src/remove_ai_watermarks/cli.py +++ b/src/remove_ai_watermarks/cli.py @@ -460,6 +460,12 @@ def cmd_erase( default=0, help="Cap long side (px) before diffusion; 0 = native (best quality, like raiw.cc). Raise only on GPU/MPS OOM.", ) +@click.option( + "--protect-text", + is_flag=True, + default=False, + help="Preserve detected text (incl. CJK) via Differential Diffusion. SDXL default pipeline only.", +) @click.pass_context def cmd_invisible( ctx: click.Context, @@ -473,6 +479,7 @@ def cmd_invisible( hf_token: str | None, humanize: float, max_resolution: int, + protect_text: bool, ) -> None: """Remove invisible AI watermarks (SynthID, StableSignature, TreeRing). 
@@ -519,6 +526,7 @@ def cmd_invisible( guidance_scale=None, seed=seed, humanize=humanize, + protect_text=protect_text, max_resolution=max_resolution, ) elapsed = time.monotonic() - t0 @@ -671,6 +679,12 @@ def cmd_identify(ctx: click.Context, source: Path, no_visible: bool, as_json: bo default=0, help="Cap long side (px) before diffusion; 0 = native (best quality, like raiw.cc). Raise only on GPU/MPS OOM.", ) +@click.option( + "--protect-text", + is_flag=True, + default=False, + help="Preserve detected text (incl. CJK) via Differential Diffusion. SDXL default pipeline only.", +) @click.pass_context def cmd_all( ctx: click.Context, @@ -687,6 +701,7 @@ def cmd_all( hf_token: str | None, humanize: float, max_resolution: int, + protect_text: bool, ) -> None: """Remove ALL watermarks: visible + invisible + metadata. @@ -778,6 +793,7 @@ def cmd_all( num_inference_steps=steps, seed=seed, humanize=humanize, + protect_text=protect_text, max_resolution=max_resolution, ) console.print(" [green]✓[/] Invisible watermark removed") diff --git a/src/remove_ai_watermarks/invisible_engine.py b/src/remove_ai_watermarks/invisible_engine.py index 98225d1..f979c5d 100644 --- a/src/remove_ai_watermarks/invisible_engine.py +++ b/src/remove_ai_watermarks/invisible_engine.py @@ -125,6 +125,7 @@ class InvisibleEngine: seed: int | None = None, humanize: float = 0.0, protect_faces: bool = True, + protect_text: bool = False, max_resolution: int = 0, ) -> Path: """Remove invisible watermark from an image. @@ -138,6 +139,8 @@ class InvisibleEngine: seed: Random seed for reproducibility. humanize: Intensity of Analog Humanizer film grain (0 = off). protect_faces: Boolean to extract and restore faces intact. + protect_text: Preserve detected text regions via Differential + Diffusion so glyphs (incl. CJK) survive the removal pass. max_resolution: Cap the long side (px) before diffusion. 0 (default) = native resolution, no pre-downscale -- matches the hosted raiw.cc backend. 
Set a positive value only to bound GPU/MPS
@@ -210,6 +213,7 @@ class InvisibleEngine:
                 num_inference_steps=num_inference_steps,
                 guidance_scale=guidance_scale,
                 seed=seed,
+                protect_text=protect_text,
             )
 
             # Optional: Face restoration & Humanizer (Phase 2 - Post-processing)
diff --git a/src/remove_ai_watermarks/noai/img2img_runner.py b/src/remove_ai_watermarks/noai/img2img_runner.py
index 9a14cf8..85951aa 100644
--- a/src/remove_ai_watermarks/noai/img2img_runner.py
+++ b/src/remove_ai_watermarks/noai/img2img_runner.py
@@ -121,6 +121,119 @@ def run_img2img_with_mps_fallback(
         raise
 
 
+def run_differential(
+    pipeline: Any,
+    image: Image.Image,
+    change_map: Any,
+    strength: float,
+    num_inference_steps: int,
+    guidance_scale: float,
+    generator: Any,
+    device: str,
+    set_progress: Callable[[str], None],
+) -> Image.Image:
+    """Run the SDXL Differential-Diffusion pipeline and return the image.
+
+    Unlike standard img2img, the differential pipeline needs pre-processed image
+    tensors plus a per-pixel change map (HxW float32 in [0, 1]); white preserves
+    the original pixels, black regenerates them. Runs without a step callback --
+    the community pipeline's callback signature differs across diffusers
+    versions, and a protect-text pass is short.
+
+    Args:
+        pipeline: Loaded differential img2img pipeline.
+        image: Input image; pre-processed with ``pipeline.image_processor``.
+        change_map: HxW change map (NumPy array or anything
+            ``torch.as_tensor`` accepts); converted to float32.
+        strength: Denoising strength forwarded to the pipeline.
+        num_inference_steps: Number of denoising steps.
+        guidance_scale: Classifier-free guidance scale.
+        generator: Optional random generator forwarded to the pipeline.
+        device: Device the input tensors are moved to.
+        set_progress: Callback that receives a progress message.
+
+    Returns:
+        The first image of the pipeline result.
+    """
+    import torch
+
+    image_tensor = pipeline.image_processor.preprocess(image).to(device)
+    # as_tensor (vs. from_numpy) also accepts lists/tensors and non-float32
+    # arrays, and enforces the float32 dtype the docstring promises.
+    map_tensor = torch.as_tensor(change_map, dtype=torch.float32)[None].to(device)  # pyright: ignore[reportPrivateImportUsage, reportUnknownMemberType]
+    set_progress(f"Running protected regeneration ({device}, strength={strength})...")
+    result = pipeline(
+        prompt="",
+        image=image_tensor,
+        original_image=image_tensor,
+        map=map_tensor,
+        strength=strength,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        generator=generator,
+    )
+    return result.images[0]
+
+
+def run_differential_with_mps_fallback(
+    load_pipeline: Callable[[], Any],
+    image: Image.Image,
+    change_map: Any,
+    strength: float,
+    num_inference_steps: int,
+    guidance_scale: float,
+    generator: Any,
+    device: str,
+    set_progress: Callable[[str], None],
+    *,
+    reload_on_cpu: Callable[[], Any],
+) -> tuple[Image.Image, str]:
+    """Run differential img2img; on MPS error, fall back to CPU.
+
+    Only a ``RuntimeError`` raised while ``device == "mps"`` that
+    ``is_mps_error`` recognizes triggers the fallback; anything else propagates.
+    The CPU retry uses the pipeline returned by ``reload_on_cpu`` and passes
+    ``generator=None`` instead of the caller's generator, so the retry is not
+    seeded by it.
+
+    Returns:
+        (result_image, final_device) -- device may change to ``"cpu"`` on fallback.
+    """
+    pipeline = load_pipeline()
+    try:
+        img = run_differential(
+            pipeline,
+            image,
+            change_map,
+            strength,
+            num_inference_steps,
+            guidance_scale,
+            generator,
+            device,
+            set_progress,
+        )
+        return img, device
+    except RuntimeError as error:
+        if device == "mps" and is_mps_error(error):
+            logger.warning("MPS error detected: %s. Falling back to CPU.", error)
+            set_progress("MPS error! Clearing cache and retrying on CPU...")
+            _try_clear_mps_cache()
+            pipeline = reload_on_cpu()
+            img = run_differential(
+                pipeline,
+                image,
+                change_map,
+                strength,
+                num_inference_steps,
+                guidance_scale,
+                None,
+                "cpu",
+                set_progress,
+            )
+            return img, "cpu"
+        raise
+
+
 def _call_pipeline(
     pipeline: Any,
     image: Image.Image,
diff --git a/src/remove_ai_watermarks/noai/watermark_remover.py b/src/remove_ai_watermarks/noai/watermark_remover.py
index 503ce56..39c9fc4 100644
--- a/src/remove_ai_watermarks/noai/watermark_remover.py
+++ b/src/remove_ai_watermarks/noai/watermark_remover.py
@@ -229,6 +229,13 @@ def get_device() -> str:
 # Keep legacy name available for backwards compatibility
 _detect_model_profile_from_id = detect_model_profile
 
+# SDXL Differential-Diffusion community pipeline, pinned to the installed
+# diffusers version so the fetched pipeline code matches the library (see #21).
+# Diffusers' dynamic-module loader resolves ``custom_revision`` against the
+# package version string (``0.38.0``), NOT the GitHub git tag (``v0.38.0``).
+_DIFF_PIPELINE_NAME = "pipeline_stable_diffusion_xl_differential_img2img"
+_DIFF_PIPELINE_REVISION = "0.38.0"
+
 
 class WatermarkRemover:
     """Remove watermarks from images using diffusion model regeneration.
@@ -271,6 +278,7 @@ class WatermarkRemover:
         self.torch_dtype = torch_dtype
 
         self._pipeline: AutoImg2ImgPipeline | None = None
+        self._diff_pipeline: Any = None
         self._ctrlregen_engine: Any = None
         self._progress_callback = progress_callback
         self.hf_token: str | None = hf_token or os.environ.get("HF_TOKEN")
@@ -379,6 +387,7 @@ class WatermarkRemover:
         num_inference_steps: int = 50,
         guidance_scale: float | None = None,
         seed: int | None = None,
+        protect_text: bool = False,
     ) -> Path:
         """Remove watermark from an image using regeneration attack.
 
@@ -389,6 +398,8 @@ class WatermarkRemover:
             num_inference_steps: Number of denoising steps.
             guidance_scale: Classifier-free guidance scale.
             seed: Random seed for reproducibility.
+            protect_text: Preserve detected text regions via Differential
+                Diffusion (SDXL default profile only). Off by default.
 
         Returns:
             Path to the cleaned image.
@@ -437,7 +448,21 @@ class WatermarkRemover:
                 guidance_scale,
                 generator,
             )
+        elif protect_text and self._can_protect_text():
+            cleaned_image = self._run_differential(
+                init_image,
+                strength,
+                num_inference_steps,
+                guidance_scale,
+                generator,
+            )
         else:
+            if protect_text:
+                logger.warning(
+                    "protect_text requested but unavailable "
+                    "(needs the SDXL default model and the cv2 text detector); "
+                    "running standard img2img."
+                )
             cleaned_image = self._run_img2img(
                 init_image,
                 strength,
@@ -520,6 +545,110 @@ class WatermarkRemover:
         self._pipeline = None
         return self._load_pipeline()
 
+    # ── Text-protected differential runner ───────────────────────────
+
+    def _can_protect_text(self) -> bool:
+        """True when text protection can run: SDXL default model + cv2 detector."""
+        from remove_ai_watermarks import text_protector
+
+        return self.model_id == self.DEFAULT_MODEL_ID and text_protector.is_available()
+
+    def _load_differential_pipeline(self) -> Any:
+        """Load the SDXL Differential-Diffusion community pipeline lazily."""
+        if self._diff_pipeline is None:
+            from diffusers import DiffusionPipeline
+
+            self._set_progress("Loading Differential-Diffusion pipeline (protect-text)...")
+            use_fp16 = self.device in {"mps", "cuda"}
+            load_kwargs: dict[str, Any] = {
+                "custom_pipeline": _DIFF_PIPELINE_NAME,
+                "custom_revision": _DIFF_PIPELINE_REVISION,
+                "torch_dtype": torch.float16 if use_fp16 else torch.float32,  # type: ignore[attr-defined]
+                "use_safetensors": True,
+            }
+            if use_fp16:
+                load_kwargs["variant"] = "fp16"
+            if self.hf_token:
+                load_kwargs["token"] = self.hf_token
+
+            pipeline = DiffusionPipeline.from_pretrained(self.model_id, **load_kwargs).to(self.device)
+            # The differential pipeline upcasts the SDXL VAE to fp32 internally
+            # (the fp16 VAE decodes to NaN/black otherwise), so we add no extra
+            # VAE handling here. Attention slicing is also left off on MPS: it
+            # produced NaN latents with this pipeline, and the protect-text pass
+            # is short enough not to need it.
+            with contextlib.suppress(Exception):
+                pipeline.set_progress_bar_config(disable=True)
+            self._diff_pipeline = pipeline
+        return self._diff_pipeline
+
+    def _reload_differential_on_cpu(self) -> Any:
+        """Reload the differential pipeline on CPU after an MPS failure.
+
+        Also drops the cached standard pipeline: it was loaded for the old
+        device, so it must be lazily reloaded to match ``self.device``.
+        """
+        self.device = "cpu"
+        self.torch_dtype = torch.float32  # type: ignore[assignment]
+        self._diff_pipeline = None
+        self._pipeline = None
+        return self._load_differential_pipeline()
+
+    def _run_differential(
+        self,
+        init_image: Image.Image,
+        strength: float,
+        num_inference_steps: int,
+        guidance_scale: float,
+        generator: Any,
+    ) -> Image.Image:
+        """Run differential img2img that preserves detected text regions.
+
+        Falls back to standard img2img when text detection fails or finds no
+        text, so the second SDXL pipeline is only loaded when there is
+        something to protect.
+        """
+        import cv2
+        import numpy as np
+
+        from remove_ai_watermarks import text_protector
+
+        self._set_progress("Detecting text regions to protect (protect-text)...")
+        bgr = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR)
+        try:
+            boxes = text_protector.TextProtector().detect_text_boxes(bgr)
+        except Exception as exc:
+            logger.warning("Text detection failed (%s); running standard img2img.", exc)
+            return self._run_img2img(init_image, strength, num_inference_steps, guidance_scale, generator)
+        if len(boxes) == 0:
+            # An all-zero change map means "regenerate everything", i.e. plain
+            # img2img -- skip loading the differential pipeline for that.
+            logger.info("No text regions detected; running standard img2img.")
+            return self._run_img2img(init_image, strength, num_inference_steps, guidance_scale, generator)
+
+        width, height = init_image.size
+        change_map = text_protector.build_change_map(boxes, height, width)
+        self._set_progress(f"Protecting {len(boxes)} text region(s) via Differential Diffusion...")
+
+        from remove_ai_watermarks.noai.img2img_runner import run_differential_with_mps_fallback
+
+        result_image, final_device = run_differential_with_mps_fallback(
+            load_pipeline=self._load_differential_pipeline,
+            image=init_image,
+            change_map=change_map,
+            strength=strength,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            generator=generator,
+            device=self.device,
+            set_progress=self._set_progress,
+            reload_on_cpu=self._reload_differential_on_cpu,
+        )
+        if final_device != self.device:
+            self.device = final_device
+            self.torch_dtype = torch.float32  # type: ignore[assignment]
+        return result_image
+
     # ── CtrlRegen runner ─────────────────────────────────────────────
 
     def _run_ctrlregen(
diff --git a/src/remove_ai_watermarks/text_protector.py b/src/remove_ai_watermarks/text_protector.py
new file mode 100644
index 0000000..fe10fe3
--- /dev/null
+++ b/src/remove_ai_watermarks/text_protector.py
@@ -0,0 +1,167 @@
+"""Text-region protection for diffusion-based watermark removal.
+
+SDXL img2img (the ``invisible`` pipeline) regenerates every pixel, so small text
+and CJK glyphs get deformed at the strengths that defeat SynthID (issue #21).
+This module detects text regions and builds a per-pixel "change map" for
+Differential Diffusion: the background is regenerated normally while text
+regions are largely preserved, so glyphs survive the watermark-removal pass.
+
+Detection uses only OpenCV's DNN module (no torch): the PP-OCRv3 text detector
+is a ~2.4 MB ONNX model (Apache-2.0, from opencv_zoo) that is CJK-native and
+returns rotated quadrilaterals. The model is downloaded and cached on first use;
+it is never bundled in this repo.
+
+Change-map polarity (verified empirically against the differential pipeline):
+white (1.0) = PRESERVE the original pixels, black (0.0) = MAXIMUM change. So the
+map is black everywhere except the text polygons, which are painted toward
+white. ``preserve`` stays below a hard 1.0 freeze by default: SynthID is
+designed to survive cropping, so totally freezing text pixels would leave the
+watermark intact there. A high-but-partial preserve still scrubs lightly.
+"""
+
+# cv2 ships no type stubs; mirror the pragma used by the other cv2-using modules.
+# pyright: reportMissingTypeStubs=false, reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false, reportCallIssue=false, reportArgumentType=false, reportReturnType=false
+
+from __future__ import annotations
+
+import logging
+import os
+import tempfile
+import urllib.request
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+logger = logging.getLogger(__name__)
+
+# PP-OCRv3 Chinese text detector (DB head), opencv_zoo, Apache-2.0.
+_MODEL_URL = (
+    "https://github.com/opencv/opencv_zoo/raw/main/models/text_detection_ppocr/text_detection_cn_ppocrv3_2023may.onnx"
+)
+_MODEL_FILENAME = "text_detection_cn_ppocrv3_2023may.onnx"
+
+# DB detector input: long side scaled to this, rounded to a multiple of 32.
+_DET_INPUT_LONG_SIDE = 736
+# ImageNet mean (x255) and 1/255 scale -- the normalization PP-OCRv3 expects.
+_DET_MEAN = (0.485 * 255, 0.456 * 255, 0.406 * 255)
+_DET_SCALE = 1 / 255.0
+
+
+def is_available() -> bool:
+    """True when OpenCV is importable and exposes ``cv2.dnn.TextDetectionModel_DB``."""
+    try:
+        import cv2
+
+        return hasattr(getattr(cv2, "dnn", None), "TextDetectionModel_DB")
+    except ImportError:
+        return False
+
+
+def _cache_dir() -> Path:
+    """Local cache directory for the detector model (created on demand)."""
+    cache = Path.home() / ".cache" / "remove-ai-watermarks"
+    cache.mkdir(parents=True, exist_ok=True)
+    return cache
+
+
+def _model_path() -> Path:
+    """Return the cached detector path, downloading it (60 s timeout) on first use."""
+    target = _cache_dir() / _MODEL_FILENAME
+    if target.exists() and target.stat().st_size > 0:
+        return target
+    logger.info("Downloading PP-OCRv3 text detector (~2.4 MB) to %s", target)
+    # Download to a temp file in the same dir, then atomically rename so a
+    # partial download never leaves a corrupt model cached.
+    fd, tmp_name = tempfile.mkstemp(dir=str(target.parent), suffix=".onnx.part")
+    tmp_path = Path(tmp_name)
+    try:
+        os.close(fd)
+        with urllib.request.urlopen(_MODEL_URL, timeout=60) as resp:  # noqa: S310 (trusted GitHub URL)
+            tmp_path.write_bytes(resp.read())
+        tmp_path.replace(target)
+    finally:
+        if tmp_path.exists():
+            tmp_path.unlink()
+    return target
+
+
+def build_change_map(
+    boxes: list[NDArray[Any]],
+    height: int,
+    width: int,
+    preserve: float = 0.9,
+    feather: int = 15,
+) -> NDArray[Any]:
+    """Build a Differential-Diffusion change map from text polygons.
+
+    Args:
+        boxes: Text-region polygons as arrays of (x, y) vertices.
+        height: Output map height in pixels.
+        width: Output map width in pixels.
+        preserve: Map value painted inside text polygons (0..1). White (1.0)
+            fully preserves the original pixels; the default 0.9 preserves
+            strongly while still letting a light scrub through.
+        feather: Gaussian-blur kernel size for soft polygon edges (forced odd).
+
+    Returns:
+        Float32 HxW array in [0, 1]: ~0 in the background (full change),
+        ``preserve`` inside text regions, blended at the edges.
+    """
+    import cv2
+    import numpy as np
+
+    change_map = np.zeros((height, width), np.float32)
+    if boxes:
+        polys = [np.asarray(b, np.int32) for b in boxes]
+        cv2.fillPoly(change_map, polys, float(preserve))
+    if feather > 0:
+        if feather % 2 == 0:
+            feather += 1
+        change_map = cv2.GaussianBlur(change_map, (feather, feather), 0)
+    # Clip unconditionally: GaussianBlur can overshoot by a float epsilon and
+    # ``preserve`` may be outside [0, 1]; the result must stay a valid map.
+    np.clip(change_map, 0.0, 1.0, out=change_map)
+    return change_map
+
+
+class TextProtector:
+    """Detect text regions with PP-OCRv3 for diffusion change-map protection."""
+
+    def __init__(
+        self,
+        binary_threshold: float = 0.3,
+        polygon_threshold: float = 0.5,
+        max_candidates: int = 200,
+        unclip_ratio: float = 2.0,
+    ) -> None:
+        import cv2
+
+        self._detector = cv2.dnn.TextDetectionModel_DB(str(_model_path()))
+        self._detector.setBinaryThreshold(binary_threshold)
+        self._detector.setPolygonThreshold(polygon_threshold)
+        self._detector.setMaxCandidates(max_candidates)
+        self._detector.setUnclipRatio(unclip_ratio)
+
+    def detect_text_boxes(self, bgr_image: NDArray[Any]) -> list[NDArray[Any]]:
+        """Detect text regions, returning a list of rotated quad polygons.
+
+        Args:
+            bgr_image: Image as an HxWx3 BGR uint8 array (OpenCV convention).
+
+        Returns:
+            One array of four (x, y) vertices per detected text region.
+        """
+        height, width = bgr_image.shape[:2]
+        scale = _DET_INPUT_LONG_SIDE / max(height, width)
+        in_w = max((round(width * scale) // 32) * 32, 32)
+        in_h = max((round(height * scale) // 32) * 32, 32)
+        self._detector.setInputParams(
+            scale=_DET_SCALE,
+            size=(in_w, in_h),
+            mean=_DET_MEAN,
+            swapRB=True,
+        )
+        boxes, _confidences = self._detector.detect(bgr_image)
+        return list(boxes)
diff --git a/tests/test_img2img_runner.py b/tests/test_img2img_runner.py
index c76bed5..5b0b73c 100644
--- a/tests/test_img2img_runner.py
+++ b/tests/test_img2img_runner.py
@@ -14,7 +14,11 @@ from unittest.mock import Mock
 import pytest
 
 from remove_ai_watermarks.noai import img2img_runner
-from remove_ai_watermarks.noai.img2img_runner import run_img2img, run_img2img_with_mps_fallback
+from remove_ai_watermarks.noai.img2img_runner import (
+    run_differential_with_mps_fallback,
+    run_img2img,
+    run_img2img_with_mps_fallback,
+)
 
 _MPS_OOM = "MPS backend out of memory (MPS allocated: 17.21 GiB, max allowed: 20.13 GiB)"
 
@@ -106,6 +110,80 @@ class TestMpsFallback:
         reload_on_cpu.assert_not_called()
 
 
+class TestDifferentialMpsFallback:
+    """The protect-text (Differential Diffusion) path shares the MPS->CPU
+    fallback contract; mock ``run_differential`` so no torch/model is needed."""
+
+    def test_mps_error_reloads_on_cpu_and_retries(self, monkeypatch: pytest.MonkeyPatch):
+        sentinel = object()
+        inner = Mock(side_effect=[RuntimeError(_MPS_OOM), sentinel])
+        monkeypatch.setattr(img2img_runner, "run_differential", inner)
+        reload_on_cpu = Mock(return_value="cpu_pipe")
+
+        img, device = run_differential_with_mps_fallback(
+            load_pipeline=Mock(return_value="gpu_pipe"),
+            image=object(),
+            change_map=object(),
+            strength=0.05,
+            num_inference_steps=50,
+            guidance_scale=7.5,
+            generator="gen",
+            device="mps",
+            set_progress=lambda _m: None,
+            reload_on_cpu=reload_on_cpu,
+        )
+
+        assert (img, device) == (sentinel, "cpu")
+        reload_on_cpu.assert_called_once()
+        assert inner.call_count == 2
+        # Retry uses the reloaded CPU pipeline and device "cpu", and passes
+        # generator=None instead of the caller's generator.
+        retry_args = inner.call_args_list[1].args
+        assert retry_args[0] == "cpu_pipe"
+        assert retry_args[6] is None  # generator
+        assert retry_args[7] == "cpu"  # device
+
+    def test_happy_path_returns_original_device_without_reload(self, monkeypatch: pytest.MonkeyPatch):
+        sentinel = object()
+        monkeypatch.setattr(img2img_runner, "run_differential", Mock(return_value=sentinel))
+        reload_on_cpu = Mock()
+
+        img, device = run_differential_with_mps_fallback(
+            load_pipeline=Mock(return_value="gpu_pipe"),
+            image=object(),
+            change_map=object(),
+            strength=0.05,
+            num_inference_steps=50,
+            guidance_scale=7.5,
+            generator="gen",
+            device="mps",
+            set_progress=lambda _m: None,
+            reload_on_cpu=reload_on_cpu,
+        )
+
+        assert (img, device) == (sentinel, "mps")
+        reload_on_cpu.assert_not_called()
+
+    def test_non_mps_runtime_error_propagates(self, monkeypatch: pytest.MonkeyPatch):
+        monkeypatch.setattr(img2img_runner, "run_differential", Mock(side_effect=RuntimeError("CUDA out of memory")))
+        reload_on_cpu = Mock()
+
+        with pytest.raises(RuntimeError, match="CUDA"):
+            run_differential_with_mps_fallback(
+                load_pipeline=Mock(return_value="gpu_pipe"),
+                image=object(),
+                change_map=object(),
+                strength=0.05,
+                num_inference_steps=50,
+                guidance_scale=7.5,
+                generator="gen",
+                device="mps",
+                set_progress=lambda _m: None,
+                reload_on_cpu=reload_on_cpu,
+            )
+        reload_on_cpu.assert_not_called()
+
+
 class TestRunImg2Img:
     def test_returns_first_image_from_pipeline_result(self):
         sentinel = object()
diff --git a/tests/test_text_protector.py b/tests/test_text_protector.py
new file mode 100644
index 0000000..7ac7dcb
--- /dev/null
+++ b/tests/test_text_protector.py
@@ -0,0 +1,59 @@
+"""Unit tests for the text-protection change-map helper (no model download).
+
+``build_change_map`` is the pure cv2/numpy part of ``text_protector``: it turns
+detected text polygons into a Differential-Diffusion change map. The polarity is
+load-bearing and was verified empirically (white = preserve, black = change), so
+a regression here would either freeze the whole image or fail to protect text.
+The PP-OCRv3 detector itself needs a model download and is not exercised here.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+from remove_ai_watermarks.text_protector import build_change_map
+
+
+class TestBuildChangeMap:
+    def test_no_boxes_is_all_change(self):
+        m = build_change_map([], 32, 48)
+        assert m.shape == (32, 48)
+        assert m.dtype == np.float32
+        assert float(m.max()) == 0.0
+
+    def test_text_region_is_preserved_background_is_change(self):
+        # A 20x20 box centered in a 64x64 map, no feather for a crisp check.
+        box = np.array([[22, 22], [42, 22], [42, 42], [22, 42]])
+        m = build_change_map([box], 64, 64, preserve=0.9, feather=0)
+        # Inside the polygon: painted to preserve value.
+        assert m[32, 32] == np.float32(0.9)
+        # Far background: untouched -> full change (0.0).
+        assert m[2, 2] == 0.0
+        # Polarity: text preserved more than background.
+        assert m[32, 32] > m[2, 2]
+
+    def test_preserve_value_is_respected(self):
+        box = np.array([[10, 10], [30, 10], [30, 30], [10, 30]])
+        m = build_change_map([box], 40, 40, preserve=0.5, feather=0)
+        assert m[20, 20] == np.float32(0.5)
+
+    def test_feather_creates_soft_edge_gradient(self):
+        box = np.array([[20, 20], [44, 20], [44, 44], [20, 44]])
+        m = build_change_map([box], 64, 64, preserve=1.0, feather=15)
+        center = m[32, 32]
+        # An edge pixel just outside the polygon should be partially blended:
+        # strictly between full-change (0) and the preserved center.
+        edge = m[32, 47]
+        assert 0.0 < edge < center
+        assert center <= 1.0
+
+    def test_even_feather_does_not_crash(self):
+        box = np.array([[10, 10], [30, 10], [30, 30], [10, 30]])
+        m = build_change_map([box], 40, 40, feather=14)
+        assert m.shape == (40, 40)
+
+    def test_values_stay_in_unit_range(self):
+        box = np.array([[5, 5], [35, 5], [35, 35], [5, 35]])
+        m = build_change_map([box], 40, 40, preserve=1.0, feather=9)
+        assert float(m.min()) >= 0.0
+        assert float(m.max()) <= 1.0
diff --git a/uv.lock b/uv.lock
index a10c695..f45badb 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2865,7 +2865,7 @@ wheels = [
 
 [[package]]
 name = "remove-ai-watermarks"
-version = "0.6.9"
+version = "0.6.10"
 source = { editable = "." }
 dependencies = [
     { name = "click" },