From ef6fdaeeec07583f948eb3698afd8a67c9f9a868 Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Fri, 29 May 2026 12:28:30 -0700 Subject: [PATCH] Detect text at native resolution (capped), fixing small-text recall on large images (#27) The text-protection detector scaled every image to a fixed 736 px long side, so small text on large canvases (e.g. ~16 px on 2048) was downscaled below the detector and missed -> deformed by the SDXL pass (issue #14). Detect at the native long side capped at 1536, never upscaled (_detection_input_size, a pure unit-tested helper). Detection is script-agnostic (DB segments regions, not characters), so this is language-agnostic: a new benchmark (scripts/text_detection_benchmark.py) measures recall across Latin/Cyrillic/CJK/ Hangul/Arabic/digits x sizes x canvas -> overall hit-rate 0.91 -> 1.00, worst cell (2048/16 px) 0.06 -> 1.00. Docs updated. Co-authored-by: Claude Opus 4.8 (1M context) --- CLAUDE.md | 2 +- README.md | 2 +- scripts/text_detection_benchmark.py | 150 +++++++++++++++++++++ src/remove_ai_watermarks/text_protector.py | 33 ++++- tests/test_text_protector.py | 45 ++++++- 5 files changed, 224 insertions(+), 8 deletions(-) create mode 100644 scripts/text_detection_benchmark.py diff --git a/CLAUDE.md b/CLAUDE.md index d44f64c..6915995 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -39,7 +39,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `region_eraser.py` — universal region eraser (`erase` CLI). `erase(image, boxes=|mask=, backend=)`: `boxes_to_mask` → `cv2.inpaint` (`cv2` backend, default, no deps) or big-LaMa via onnxruntime (`lama` backend, extra `lama`, `Carve/LaMa-ONNX` Apache-2.0 model downloaded on first use, never bundled). `erase_lama` crops a padded region around the mask, runs LaMa at its fixed 512² input, pastes only masked pixels back (untouched areas stay pixel-exact). Lazy `_get_lama_session` singleton; `lama_available()` guards the optional import. 
**LaMa-ONNX costs ~3.5-4 GB peak RAM and ~5-6 s/call on CPU** (FFC working set, not arena — `enable_cpu_mem_arena=False` does not help), so it does NOT fit a minimal droplet; the cv2 backend (tens of MB, ~30 ms) does. LaMa quality at low RAM = serverless/GPU, mirroring how raiw.cc offloads SDXL to fal. - `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. The `detect` extra pulls **torch** transitively (invisible-watermark declares torch a hard dep, and `WatermarkDecoder` eagerly imports `rivaGan` -> `torch` at import time), so detection needs torch present even though dwtDct runs CPU-only on cv2/numpy/pywavelets — no GPU and no separate `gpu` extra required. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. - `trustmark_detector.py` — `detect_trustmark(path)` decodes the OPEN, keyless **Adobe TrustMark** watermark (the soft binding behind Adobe Durable Content Credentials, `alg` `com.adobe.trustmark.P`) via the optional `trustmark` package (extra `trustmark`; pulls torch, downloads model weights on first use). Mirrors `invisible_watermark.py` (lazy singleton, top-of-module pyright pragma, returns None when absent). It detects *provenance*, not AI origin as such (TrustMark also marks human-authored content), so `identify` lists it as a watermark without setting `is_ai_generated`. Other soft-binding vendors (Digimarc/Imatag/Steg.AI/...) 
have no public decoder — they are only *named* via the `C2PA_SOFT_BINDINGS` scan, not decoded. -- `text_protector.py` — text-region protection for the `invisible` SDXL img2img pass (issue #21: CJK/small text deforms at watermark-removal strengths). `is_available()` gates on `cv2.dnn.TextDetectionModel_DB`; `TextProtector.detect_text_boxes(bgr)` runs the **PP-OCRv3 CN** ONNX detector (~2.4 MB, Apache-2.0, opencv_zoo, CJK-native, returns rotated quad polygons) — downloaded+cached to `~/.cache/remove-ai-watermarks` on first use via atomic temp-rename, never bundled, **no torch (cv2.dnn only)**. `build_change_map(boxes, h, w, preserve=0.9, feather=15)` paints a Differential-Diffusion change map. **Polarity (verified empirically):** white(1.0)=PRESERVE original pixels, black(0.0)=MAX change; map is black bg + `preserve` inside text polygons, Gaussian-feathered edges, clipped to [0,1]. `preserve` stays below a hard 1.0 freeze by default so text still scrubs lightly (SynthID survives cropping). Wired into `watermark_remover._run_differential` via the community `pipeline_stable_diffusion_xl_differential_img2img` (loaded with `custom_revision="0.38.0"` — HF resolves the **PyPI** version string, not the `v0.38.0` git tag); gated to the SDXL `DEFAULT_MODEL_ID` only (`_can_protect_text`), falls back to plain img2img otherwise. **Autonomous by default** (`protect_text=True` in `invisible_engine`/`watermark_remover`, mirroring `protect_faces`): the detector runs per image and `_run_differential` falls back to plain img2img when **no boxes** are found, so text-free inputs pay only the cheap cv2 detection (no differential-pipeline load). CLI exposes a single off-switch `--no-protect-text` on `invisible`/`all` (passed as `protect_text=not no_protect_text`); the unavailable-model case logs at debug, not warning, since it is now the default path. 
The diff pipeline upcasts the VAE to fp32 internally, so do **not** add `upcast_vae()`/`enable_attention_slicing` (both produced NaN/black on fp16 MPS). `build_change_map` is unit-tested without any model download (`tests/test_text_protector.py`). +- `text_protector.py` — text-region protection for the `invisible` SDXL img2img pass (issue #21: CJK/small text deforms at watermark-removal strengths). `is_available()` gates on `cv2.dnn.TextDetectionModel_DB`; `TextProtector.detect_text_boxes(bgr)` runs the **PP-OCRv3 DB** ONNX detector (~2.4 MB, Apache-2.0, opencv_zoo, returns rotated quad polygons) — downloaded+cached to `~/.cache/remove-ai-watermarks` on first use via atomic temp-rename, never bundled, **no torch (cv2.dnn only)**. **Detection is script-agnostic** (DB segments text *regions*, not characters), so Latin / Cyrillic / CJK / Hangul / Arabic / digits all detect identically — language was never the recall lever, **resolution was**. `_detection_input_size(h, w)` (pure, unit-tested) detects at the **native long side capped at `_DET_MAX_LONG_SIDE` (1536), never upscaled**: the old fixed 736 downscaled large canvases so small text fell below the detector and was missed (issue #14, e.g. ~16 px text on a 2048 image). `scripts/text_detection_benchmark.py` measures recall across scripts × sizes × canvas: the cap fix lifts overall hit-rate 0.91 → 1.00 (worst cell 2048/16 px: 0.06 → 1.00) at ~100 ms CPU. Very large canvases with tiny text may still need tiling (documented limit, not built). `build_change_map(boxes, h, w, preserve=0.9, feather=15)` paints a Differential-Diffusion change map. **Polarity (verified empirically):** white(1.0)=PRESERVE original pixels, black(0.0)=MAX change; map is black bg + `preserve` inside text polygons, Gaussian-feathered edges, clipped to [0,1]. `preserve` stays below a hard 1.0 freeze by default so text still scrubs lightly (SynthID survives cropping). 
Wired into `watermark_remover._run_differential` via the community `pipeline_stable_diffusion_xl_differential_img2img` (loaded with `custom_revision="0.38.0"` — HF resolves the **PyPI** version string, not the `v0.38.0` git tag); gated to the SDXL `DEFAULT_MODEL_ID` only (`_can_protect_text`), falls back to plain img2img otherwise. **Autonomous by default** (`protect_text=True` in `invisible_engine`/`watermark_remover`, mirroring `protect_faces`): the detector runs per image and `_run_differential` falls back to plain img2img when **no boxes** are found, so text-free inputs pay only the cheap cv2 detection (no differential-pipeline load). CLI exposes a single off-switch `--no-protect-text` on `invisible`/`all` (passed as `protect_text=not no_protect_text`); the unavailable-model case logs at debug, not warning, since it is now the default path. The diff pipeline upcasts the VAE to fp32 internally, so do **not** add `upcast_vae()`/`enable_attention_slicing` (both produced NaN/black on fp16 MPS). `build_change_map` is unit-tested without any model download (`tests/test_text_protector.py`). - `face_protector.py` — YOLO detect + soft-blend pattern; mirror this for any "protect region during diffusion" features - `image_io.py` — Unicode-safe cv2 IO (issue #17). `imread(path, flags=None)` / `imwrite(path, img)` wrap `np.fromfile`+`cv2.imdecode` / `cv2.imencode`+`tofile` so non-ASCII paths work on Windows -- bare `cv2.imread`/`cv2.imwrite` use the platform ANSI code-page API there and fail (empty decode + `can't open/read file`) on Chinese/Cyrillic/accented filenames. `imread` keeps `cv2.imread` semantics (defaults to `IMREAD_COLOR`, returns `None` on missing/empty/undecodable). **Every cv2 file read/write in the package routes through here; do not call `cv2.imread`/`cv2.imwrite` directly.** macOS/Linux already accept UTF-8 paths, so it is behavior-neutral there (the bug only reproduces on Windows). 
cv2/numpy are imported lazily inside the functions, so the module is cheap to import in a bare env. diff --git a/README.md b/README.md index cf549c6..aab15fe 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ SDXL is the default since May 2026: empirically defeats SynthID v2 on Gemini 3 P **Analog Humanizer**: optional film grain and chromatic aberration injection that mimics a photo of a screen, raising the bar for AI-generated image classifiers. (It frustrates generic classifiers but does not guarantee forensic invisibility — see the [arXiv:2605.09203](https://arxiv.org/abs/2605.09203) note above.) -**Text Protection** (automatic): SDXL img2img regenerates every pixel, so small text and CJK glyphs get deformed at the strengths that defeat SynthID. The SDXL pipeline guards against this by default: a CJK-native PP-OCRv3 text detector (a 2.4 MB ONNX model run on CPU via OpenCV's DNN module, downloaded and cached on first use) locates text regions, and if any are found the pass switches to Differential Diffusion so a per-pixel change map keeps the text regions largely intact while the background is regenerated normally. Text-free images run the standard pass at no extra cost. Pass `--no-protect-text` to turn it off. SDXL default pipeline only. +**Text Protection** (automatic): SDXL img2img regenerates every pixel, so small text and glyphs get deformed at the strengths that defeat SynthID. The SDXL pipeline guards against this by default: a PP-OCRv3 text detector (a 2.4 MB ONNX model run on CPU via OpenCV's DNN module, downloaded and cached on first use) locates text regions, and if any are found the pass switches to Differential Diffusion so a per-pixel change map keeps the text regions largely intact while the background is regenerated normally. 
Detection is **language-agnostic** (it finds text regions, not characters), so Latin, Cyrillic, CJK, Hangul, Arabic, and digits are all protected, and it runs at the image's native resolution (long side capped at 1536 px, never upscaled), so small text on large images is far less likely to be missed. Text-free images run the standard pass at no extra cost. Pass `--no-protect-text` to turn it off. SDXL default pipeline only. ### Stripping C2PA, EXIF, and "Made with AI" metadata diff --git a/scripts/text_detection_benchmark.py b/scripts/text_detection_benchmark.py new file mode 100644 index 0000000..c39c0c9 --- /dev/null +++ b/scripts/text_detection_benchmark.py @@ -0,0 +1,150 @@ +"""Multilingual recall benchmark for the text-protection detector. + +Measures the core lever of text protection (`text_protector.TextProtector`): if +the PP-OCRv3 DB detector misses a text region, that text is NOT preserved during +the SDXL watermark-removal pass and gets deformed (issue #14). This renders short +text in several scripts at several font sizes on two canvas sizes, runs detection, +and reports the fraction of each known text bbox the detector covers. + +Findings (2026-05-29): + - Detection is script-agnostic: DB segments text *regions*, not characters, so + Latin / Cyrillic / CJK / Hangul / Arabic / digits score identically. Language + was never the lever. + - The only lever is resolution. A fixed small detector input downscaled large + canvases so far that small text was missed. Detecting at the native long side + (capped, see ``text_protector._DET_MAX_LONG_SIDE``) lifts overall hit-rate + from 0.91 to 1.00 and the worst cell (~16 px text on a 2048 canvas) from + 0.06 to 1.00. + +This needs the detector model (downloaded on first use) and a font that covers +all the scripts (macOS "Arial Unicode"; on Linux install a Noto super-font). +No GPU.
Run: + + uv run python scripts/text_detection_benchmark.py +""" + +from __future__ import annotations + +import logging +import sys +from collections import defaultdict +from pathlib import Path +from typing import Any + +import cv2 +import numpy as np +from PIL import Image, ImageDraw, ImageFont +from rich.console import Console + +from remove_ai_watermarks import text_protector as tp + +log = logging.getLogger(__name__) +console = Console() + +# A single font covering every tested script isolates "language" from "font". +_FONT_CANDIDATES = [ + "/System/Library/Fonts/Supplemental/Arial Unicode.ttf", + "/Library/Fonts/Arial Unicode.ttf", + "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/truetype/unifont/unifont.ttf", +] +SCRIPTS = { + "Latin": "Generated by AI", + "Cyrillic": "Сгенерировано ИИ", + "CJK": "豆包AI生成内容", + "Hangul": "AI로 생성됨", + "Arabic": "أنشئ بالذكاء", + "Digits": "0123456789", +} +FONT_SIZES = [16, 24, 32, 48, 64] +CANVASES = [1024, 2048] +PLACEMENTS = [(0.08, 0.15), (0.30, 0.55), (0.10, 0.82)] +BG_COLORS = [(35, 40, 60), (210, 205, 200)] + + +def _find_font() -> str: + for path in _FONT_CANDIDATES: + if Path(path).exists(): + return path + raise SystemExit( + "No multi-script font found. Install one (macOS ships 'Arial Unicode'; " + "on Linux: a Noto CJK/super font) and add its path to _FONT_CANDIDATES." 
+ ) + + +def _render( + font_path: str, + canvas: int, + text: str, + font_size: int, + place: tuple[float, float], + bg: tuple[int, int, int], +) -> tuple[Image.Image, tuple[int, int, int, int]]: + img = Image.new("RGB", (canvas, canvas), bg) + draw = ImageDraw.Draw(img) + font = ImageFont.truetype(font_path, font_size) + x, y = int(place[0] * canvas), int(place[1] * canvas) + fg = (245, 245, 245) if sum(bg) < 360 else (20, 20, 20) + draw.text((x, y), text, font=font, fill=fg) + return img, draw.textbbox((x, y), text, font=font) + + +def _coverage(boxes: list[Any], bbox: tuple[int, int, int, int], h: int, w: int) -> float | None: + gt = np.zeros((h, w), np.uint8) + cv2.rectangle(gt, (bbox[0], bbox[1]), (bbox[2], bbox[3]), 1, -1) + area = int(gt.sum()) + if area == 0: + return None + det = np.zeros((h, w), np.uint8) + if boxes: + cv2.fillPoly(det, [np.asarray(b, np.int32) for b in boxes], 1) + return int((gt & det).sum()) / area + + +def _hitrate(values: list[float], thr: float = 0.5) -> float: + return sum(c >= thr for c in values) / len(values) if values else float("nan") + + +def main() -> int: + logging.basicConfig(level=logging.WARNING) + if not tp.is_available(): + raise SystemExit("text detector unavailable (need opencv with cv2.dnn.TextDetectionModel_DB)") + font_path = _find_font() + detector = tp.TextProtector() + + by_script_size: dict[tuple[str, int], list[float]] = defaultdict(list) + by_size_canvas: dict[tuple[int, int], list[float]] = defaultdict(list) + by_script: dict[str, list[float]] = defaultdict(list) + + for canvas in CANVASES: + for script, text in SCRIPTS.items(): + for font_size in FONT_SIZES: + for idx, place in enumerate(PLACEMENTS): + img, bbox = _render(font_path, canvas, text, font_size, place, BG_COLORS[idx % len(BG_COLORS)]) + bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + cov = _coverage(detector.detect_text_boxes(bgr), bbox, canvas, canvas) + if cov is None: + continue + by_script_size[(script, font_size)].append(cov) + 
by_size_canvas[(font_size, canvas)].append(cov) + by_script[script].append(cov) + + console.print("=== hit-rate (coverage>=0.5) by script x font-size ===") + console.print("script".ljust(10) + "".join(f"{fs:>7}" for fs in FONT_SIZES)) + for script in SCRIPTS: + console.print( + script.ljust(10) + "".join(f"{_hitrate(by_script_size[(script, fs)]):>7.2f}" for fs in FONT_SIZES) + ) + + console.print("\n=== hit-rate by font-size x canvas (the downscale effect) ===") + console.print("size".ljust(8) + "".join(f"{c:>8}" for c in CANVASES)) + for fs in FONT_SIZES: + console.print(str(fs).ljust(8) + "".join(f"{_hitrate(by_size_canvas[(fs, c)]):>8.2f}" for c in CANVASES)) + + overall = _hitrate([c for vals in by_script.values() for c in vals]) + console.print(f"\nOVERALL hit-rate: {overall:.2f} (detector max long side = {tp._DET_MAX_LONG_SIDE})") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/remove_ai_watermarks/text_protector.py b/src/remove_ai_watermarks/text_protector.py index fe10fe3..1f61782 100644 --- a/src/remove_ai_watermarks/text_protector.py +++ b/src/remove_ai_watermarks/text_protector.py @@ -42,8 +42,18 @@ _MODEL_URL = ( ) _MODEL_FILENAME = "text_detection_cn_ppocrv3_2023may.onnx" -# DB detector input: long side scaled to this, rounded to a multiple of 32. -_DET_INPUT_LONG_SIDE = 736 +# DB detector input: the image is detected at its NATIVE long side, capped at +# this value (rounded to a multiple of 32), never upscaled. A fixed small input +# (the old 736) downscaled large images so far that small text fell below the +# detector's resolution and was missed -- the cause of the "small text still +# distorts" reports (issue #14). Detection is script-agnostic (DB segments text +# *regions*, not characters), so this recall fix applies to every language; the +# only lever that mattered was resolution. 
1536 recovers full recall down to +# ~12 px text on a 2048 canvas at ~100 ms on CPU (a fixed 736 missed it); going +# higher buys no measured recall at 2x+ the cost. Benchmarked in +# scripts/text_detection_benchmark.py. Very large canvases with tiny text may +# still need tiling -- a documented limit, not yet built. +_DET_MAX_LONG_SIDE = 1536 # ImageNet mean (x255) and 1/255 scale -- the normalization PP-OCRv3 expects. _DET_MEAN = (0.485 * 255, 0.456 * 255, 0.406 * 255) _DET_SCALE = 1 / 255.0 @@ -87,6 +97,21 @@ def _model_path() -> Path: return target +def _detection_input_size(height: int, width: int) -> tuple[int, int]: + """DB-detector input ``(in_w, in_h)`` for an image of the given size. + + Detect at the native long side, capped at ``_DET_MAX_LONG_SIDE`` and never + upscaled, each side rounded down to a multiple of 32 (the DB head requires + /32 dims), floored at 32. Pure function so the resolution contract (the + issue #14 small-text recall fix) is unit-testable without the model. + """ + long_side = max(height, width) + scale = min(_DET_MAX_LONG_SIDE, long_side) / long_side + in_w = max((round(width * scale) // 32) * 32, 32) + in_h = max((round(height * scale) // 32) * 32, 32) + return in_w, in_h + + def build_change_map( boxes: list[NDArray[Any]], height: int, @@ -154,9 +179,7 @@ class TextProtector: One array of four (x, y) vertices per detected text region. 
""" height, width = bgr_image.shape[:2] - scale = _DET_INPUT_LONG_SIDE / max(height, width) - in_w = max((round(width * scale) // 32) * 32, 32) - in_h = max((round(height * scale) // 32) * 32, 32) + in_w, in_h = _detection_input_size(height, width) self._detector.setInputParams( scale=_DET_SCALE, size=(in_w, in_h), diff --git a/tests/test_text_protector.py b/tests/test_text_protector.py index 7ac7dcb..fd80e97 100644 --- a/tests/test_text_protector.py +++ b/tests/test_text_protector.py @@ -11,7 +11,50 @@ from __future__ import annotations import numpy as np -from remove_ai_watermarks.text_protector import build_change_map +from remove_ai_watermarks.text_protector import _DET_MAX_LONG_SIDE, _detection_input_size, build_change_map + + +class TestDetectionInputSize: + """Resolution contract for the DB detector input (issue #14 recall fix). + + A fixed small input (the old 736) downscaled large canvases so far that small + text fell below the detector's resolution and was missed. Detection now runs + at the native long side, capped and never upscaled. + """ + + def test_large_canvas_not_downscaled_to_old_736(self): + # The #14 regression: a 2048 canvas must detect well above the old 736 + # so ~12-16 px text survives. Capped at the max long side. + in_w, in_h = _detection_input_size(2048, 2048) + assert in_w == _DET_MAX_LONG_SIDE + assert in_h == _DET_MAX_LONG_SIDE + assert in_w > 736 # the old fixed input that missed small text + + def test_native_resolution_not_upscaled(self): + # A 1024 canvas detects at native 1024 (not upscaled to the cap, not + # downscaled to the old 736). 
+ assert _detection_input_size(1024, 1024) == (1024, 1024) + + def test_small_image_is_native(self): + assert _detection_input_size(512, 512) == (512, 512) + + def test_dims_are_multiples_of_32(self): + for h, w in [(2048, 1024), (1234, 567), (4096, 4096), (1000, 1000)]: + in_w, in_h = _detection_input_size(h, w) + assert in_w % 32 == 0 + assert in_h % 32 == 0 + + def test_aspect_ratio_preserved_when_capped(self): + # Portrait 2048x1024: long side capped to the max, short side scaled by + # the same factor (so the 2:1 aspect is roughly kept). + in_w, in_h = _detection_input_size(2048, 1024) + assert in_h == _DET_MAX_LONG_SIDE + assert abs((in_w / in_h) - 0.5) < 0.05 + + def test_floor_at_32(self): + in_w, in_h = _detection_input_size(10, 5) + assert in_w >= 32 + assert in_h >= 32 class TestBuildChangeMap: