From 0eec3001bbf31a19021315f9eb3a69c6b902fa37 Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Thu, 28 May 2026 12:24:09 -0700 Subject: [PATCH] feat(invisible): protect text automatically by default (#21) Mirror protect_faces: protect_text defaults to True in invisible_engine and watermark_remover, so the SDXL pipeline detects text per image and switches to Differential Diffusion only when glyphs are found. Text-free inputs fall back to plain img2img with no differential-pipeline load, so the autonomy is free. The CLI now exposes a single off-switch --no-protect-text instead of the positive flag, keeping the interface minimal. Co-Authored-By: Claude Opus 4.7 --- CLAUDE.md | 2 +- README.md | 4 ++-- src/remove_ai_watermarks/cli.py | 16 ++++++++-------- src/remove_ai_watermarks/invisible_engine.py | 7 ++++--- .../noai/watermark_remover.py | 16 +++++++++++----- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 5936965..00421d2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -36,7 +36,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `region_eraser.py` — universal region eraser (`erase` CLI). `erase(image, boxes=|mask=, backend=)`: `boxes_to_mask` → `cv2.inpaint` (`cv2` backend, default, no deps) or big-LaMa via onnxruntime (`lama` backend, extra `lama`, `Carve/LaMa-ONNX` Apache-2.0 model downloaded on first use, never bundled). `erase_lama` crops a padded region around the mask, runs LaMa at its fixed 512² input, pastes only masked pixels back (untouched areas stay pixel-exact). Lazy `_get_lama_session` singleton; `lama_available()` guards the optional import. **LaMa-ONNX costs ~3.5-4 GB peak RAM and ~5-6 s/call on CPU** (FFC working set, not arena — `enable_cpu_mem_arena=False` does not help), so it does NOT fit a minimal droplet; the cv2 backend (tens of MB, ~30 ms) does. LaMa quality at low RAM = serverless/GPU, mirroring how raiw.cc offloads SDXL to fal. 
- `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. The `detect` extra pulls **torch** transitively (invisible-watermark declares torch a hard dep, and `WatermarkDecoder` eagerly imports `rivaGan` -> `torch` at import time), so detection needs torch present even though dwtDct runs CPU-only on cv2/numpy/pywavelets — no GPU and no separate `gpu` extra required. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. - `trustmark_detector.py` — `detect_trustmark(path)` decodes the OPEN, keyless **Adobe TrustMark** watermark (the soft binding behind Adobe Durable Content Credentials, `alg` `com.adobe.trustmark.P`) via the optional `trustmark` package (extra `trustmark`; pulls torch, downloads model weights on first use). Mirrors `invisible_watermark.py` (lazy singleton, top-of-module pyright pragma, returns None when absent). It detects *provenance*, not AI origin as such (TrustMark also marks human-authored content), so `identify` lists it as a watermark without setting `is_ai_generated`. Other soft-binding vendors (Digimarc/Imatag/Steg.AI/...) have no public decoder — they are only *named* via the `C2PA_SOFT_BINDINGS` scan, not decoded. -- `text_protector.py` — text-region protection for the `invisible` SDXL img2img pass (issue #21: CJK/small text deforms at watermark-removal strengths). 
`is_available()` gates on `cv2.dnn.TextDetectionModel_DB`; `TextProtector.detect_text_boxes(bgr)` runs the **PP-OCRv3 CN** ONNX detector (~2.4 MB, Apache-2.0, opencv_zoo, CJK-native, returns rotated quad polygons) — downloaded+cached to `~/.cache/remove-ai-watermarks` on first use via atomic temp-rename, never bundled, **no torch (cv2.dnn only)**. `build_change_map(boxes, h, w, preserve=0.9, feather=15)` paints a Differential-Diffusion change map. **Polarity (verified empirically):** white(1.0)=PRESERVE original pixels, black(0.0)=MAX change; map is black bg + `preserve` inside text polygons, Gaussian-feathered edges, clipped to [0,1]. `preserve` stays below a hard 1.0 freeze by default so text still scrubs lightly (SynthID survives cropping). Wired into `watermark_remover._run_differential` via the community `pipeline_stable_diffusion_xl_differential_img2img` (loaded with `custom_revision="0.38.0"` — HF resolves the **PyPI** version string, not the `v0.38.0` git tag); gated to the SDXL `DEFAULT_MODEL_ID` only (`_can_protect_text`), falls back to plain img2img with a warning otherwise. The diff pipeline upcasts the VAE to fp32 internally, so do **not** add `upcast_vae()`/`enable_attention_slicing` (both produced NaN/black on fp16 MPS). `build_change_map` is unit-tested without any model download (`tests/test_text_protector.py`). +- `text_protector.py` — text-region protection for the `invisible` SDXL img2img pass (issue #21: CJK/small text deforms at watermark-removal strengths). `is_available()` gates on `cv2.dnn.TextDetectionModel_DB`; `TextProtector.detect_text_boxes(bgr)` runs the **PP-OCRv3 CN** ONNX detector (~2.4 MB, Apache-2.0, opencv_zoo, CJK-native, returns rotated quad polygons) — downloaded+cached to `~/.cache/remove-ai-watermarks` on first use via atomic temp-rename, never bundled, **no torch (cv2.dnn only)**. `build_change_map(boxes, h, w, preserve=0.9, feather=15)` paints a Differential-Diffusion change map. 
**Polarity (verified empirically):** white(1.0)=PRESERVE original pixels, black(0.0)=MAX change; map is black bg + `preserve` inside text polygons, Gaussian-feathered edges, clipped to [0,1]. `preserve` stays below a hard 1.0 freeze by default so text still scrubs lightly (SynthID survives cropping). Wired into `watermark_remover._run_differential` via the community `pipeline_stable_diffusion_xl_differential_img2img` (loaded with `custom_revision="0.38.0"` — HF resolves the **PyPI** version string, not the `v0.38.0` git tag); gated to the SDXL `DEFAULT_MODEL_ID` only (`_can_protect_text`), falls back to plain img2img otherwise. **Autonomous by default** (`protect_text=True` in `invisible_engine`/`watermark_remover`, mirroring `protect_faces`): the detector runs per image and `_run_differential` falls back to plain img2img when **no boxes** are found, so text-free inputs pay only the cheap cv2 detection (no differential-pipeline load). CLI exposes a single off-switch `--no-protect-text` on `invisible`/`all` (passed as `protect_text=not no_protect_text`); the unavailable-model case logs at debug, not warning, since it is now the default path. The diff pipeline upcasts the VAE to fp32 internally, so do **not** add `upcast_vae()`/`enable_attention_slicing` (both produced NaN/black on fp16 MPS). `build_change_map` is unit-tested without any model download (`tests/test_text_protector.py`). - `face_protector.py` — YOLO detect + soft-blend pattern; mirror this for any "protect region during diffusion" features - `image_io.py` — Unicode-safe cv2 IO (issue #17). `imread(path, flags=None)` / `imwrite(path, img)` wrap `np.fromfile`+`cv2.imdecode` / `cv2.imencode`+`tofile` so non-ASCII paths work on Windows -- bare `cv2.imread`/`cv2.imwrite` use the platform ANSI code-page API there and fail (empty decode + `can't open/read file`) on Chinese/Cyrillic/accented filenames. 
`imread` keeps `cv2.imread` semantics (defaults to `IMREAD_COLOR`, returns `None` on missing/empty/undecodable). **Every cv2 file read/write in the package routes through here; do not call `cv2.imread`/`cv2.imwrite` directly.** macOS/Linux already accept UTF-8 paths, so it is behavior-neutral there (the bug only reproduces on Windows). cv2/numpy are imported lazily inside the functions, so the module is cheap to import in a bare env. diff --git a/README.md b/README.md index 9a64f7e..93fcb3f 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ SDXL is the default since May 2026: empirically defeats SynthID v2 on Gemini 3 P **Analog Humanizer**: optional film grain and chromatic aberration injection that mimics a photo of a screen, raising the bar for AI-generated image classifiers. (It frustrates generic classifiers but does not guarantee forensic invisibility — see the [arXiv:2605.09203](https://arxiv.org/abs/2605.09203) note above.) -**Text Protection** (`--protect-text`): SDXL img2img regenerates every pixel, so small text and CJK glyphs get deformed at the strengths that defeat SynthID. With this flag a CJK-native PP-OCRv3 text detector (a 2.4 MB ONNX model run on CPU via OpenCV's DNN module, downloaded and cached on first use) locates text regions and the pass switches to Differential Diffusion: a per-pixel change map keeps the text regions largely intact while the background is regenerated normally, so glyphs survive the removal pass. SDXL default pipeline only. +**Text Protection** (automatic): SDXL img2img regenerates every pixel, so small text and CJK glyphs get deformed at the strengths that defeat SynthID. 
The SDXL pipeline guards against this by default: a CJK-native PP-OCRv3 text detector (a 2.4 MB ONNX model run on CPU via OpenCV's DNN module, downloaded and cached on first use) locates text regions, and if any are found the pass switches to Differential Diffusion so a per-pixel change map keeps the text regions largely intact while the background is regenerated normally. Text-free images run the standard pass; the only overhead is the lightweight text-detection step (the Differential Diffusion pipeline is never loaded). Pass `--no-protect-text` to turn it off. SDXL default pipeline only. ### Stripping C2PA, EXIF, and "Made with AI" metadata @@ -249,7 +249,7 @@ remove-ai-watermarks erase image.png --region 1640,1930,400,100 -o clean.png remove-ai-watermarks invisible image.png -o clean.png --humanize 4.0 # Runs at native resolution by default. On a very large image that OOMs the # GPU/MPS, cap the long side: --max-resolution 2048 -# Preserve text / CJK glyphs during regeneration: --protect-text +# Text / CJK glyphs are preserved automatically; disable with --no-protect-text # Check / strip AI metadata (C2PA, EXIF, "Made with AI" labels) # --check also flags SynthID-bearing sources: a C2PA manifest signed by diff --git a/src/remove_ai_watermarks/cli.py b/src/remove_ai_watermarks/cli.py index 9809824..d2ad9a1 100644 --- a/src/remove_ai_watermarks/cli.py +++ b/src/remove_ai_watermarks/cli.py @@ -461,10 +461,10 @@ def cmd_erase( help="Cap long side (px) before diffusion; 0 = native (best quality, like raiw.cc). Raise only on GPU/MPS OOM.", ) @click.option( - "--protect-text", + "--no-protect-text", is_flag=True, default=False, - help="Preserve detected text (incl. CJK) via Differential Diffusion.
SDXL default pipeline only.", + help="Disable automatic text protection (text/CJK is preserved by default on the SDXL pipeline).", ) @click.pass_context def cmd_invisible( @@ -479,7 +479,7 @@ def cmd_invisible( hf_token: str | None, humanize: float, max_resolution: int, - protect_text: bool, + no_protect_text: bool, ) -> None: """Remove invisible AI watermarks (SynthID, StableSignature, TreeRing). @@ -526,7 +526,7 @@ def cmd_invisible( guidance_scale=None, seed=seed, humanize=humanize, - protect_text=protect_text, + protect_text=not no_protect_text, max_resolution=max_resolution, ) elapsed = time.monotonic() - t0 @@ -680,10 +680,10 @@ def cmd_identify(ctx: click.Context, source: Path, no_visible: bool, as_json: bo help="Cap long side (px) before diffusion; 0 = native (best quality, like raiw.cc). Raise only on GPU/MPS OOM.", ) @click.option( - "--protect-text", + "--no-protect-text", is_flag=True, default=False, - help="Preserve detected text (incl. CJK) via Differential Diffusion. SDXL default pipeline only.", + help="Disable automatic text protection (text/CJK is preserved by default on the SDXL pipeline).", ) @click.pass_context def cmd_all( @@ -701,7 +701,7 @@ def cmd_all( hf_token: str | None, humanize: float, max_resolution: int, - protect_text: bool, + no_protect_text: bool, ) -> None: """Remove ALL watermarks: visible + invisible + metadata. 
@@ -793,7 +793,7 @@ def cmd_all( num_inference_steps=steps, seed=seed, humanize=humanize, - protect_text=protect_text, + protect_text=not no_protect_text, max_resolution=max_resolution, ) console.print(" [green]✓[/] Invisible watermark removed") diff --git a/src/remove_ai_watermarks/invisible_engine.py b/src/remove_ai_watermarks/invisible_engine.py index f979c5d..130334e 100644 --- a/src/remove_ai_watermarks/invisible_engine.py +++ b/src/remove_ai_watermarks/invisible_engine.py @@ -125,7 +125,7 @@ class InvisibleEngine: seed: int | None = None, humanize: float = 0.0, protect_faces: bool = True, - protect_text: bool = False, + protect_text: bool = True, max_resolution: int = 0, ) -> Path: """Remove invisible watermark from an image. @@ -139,8 +139,9 @@ class InvisibleEngine: seed: Random seed for reproducibility. humanize: Intensity of Analog Humanizer film grain (0 = off). protect_faces: Boolean to extract and restore faces intact. - protect_text: Preserve detected text regions via Differential - Diffusion so glyphs (incl. CJK) survive the removal pass. + protect_text: Detect text regions and preserve them via Differential + Diffusion when any are found, so glyphs (incl. CJK) survive the + removal pass. On by default; the detector decides per image. max_resolution: Cap the long side (px) before diffusion. 0 (default) = native resolution, no pre-downscale -- matches the hosted raiw.cc backend. Set a positive value only to bound GPU/MPS diff --git a/src/remove_ai_watermarks/noai/watermark_remover.py b/src/remove_ai_watermarks/noai/watermark_remover.py index 39c9fc4..9038709 100644 --- a/src/remove_ai_watermarks/noai/watermark_remover.py +++ b/src/remove_ai_watermarks/noai/watermark_remover.py @@ -387,7 +387,7 @@ class WatermarkRemover: num_inference_steps: int = 50, guidance_scale: float | None = None, seed: int | None = None, - protect_text: bool = False, + protect_text: bool = True, ) -> Path: """Remove watermark from an image using regeneration attack. 
@@ -398,8 +398,10 @@ class WatermarkRemover: num_inference_steps: Number of denoising steps. guidance_scale: Classifier-free guidance scale. seed: Random seed for reproducibility. - protect_text: Preserve detected text regions via Differential - Diffusion (SDXL default profile only). Off by default. + protect_text: Detect text regions and preserve them via Differential + Diffusion when any are found (SDXL default profile only). On by + default; the detector decides per image, and text-free inputs run + the standard pass at no extra cost. Returns: Path to the cleaned image. @@ -458,8 +460,8 @@ class WatermarkRemover: ) else: if protect_text: - logger.warning( - "protect_text requested but unavailable " + logger.debug( + "Text protection unavailable " "(needs the SDXL default model and the cv2 text detector); " "running standard img2img." ) @@ -611,6 +613,10 @@ class WatermarkRemover: logger.warning("Text detection failed (%s); running standard img2img.", exc) return self._run_img2img(init_image, strength, num_inference_steps, guidance_scale, generator) + if not boxes: + self._set_progress("No text detected; running standard img2img.") + return self._run_img2img(init_image, strength, num_inference_steps, guidance_scale, generator) + width, height = init_image.size change_map = text_protector.build_change_map(boxes, height, width) self._set_progress(f"Protecting {len(boxes)} text region(s) via Differential Diffusion...")