From 2fcd00ced0fce542fd598aae5376fa18c9f68431 Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Tue, 9 Jun 2026 13:21:13 -0700 Subject: [PATCH] fix: address whole-project code review (visible all/batch, engine consolidation, I/O) Nine findings from a high-effort project-wide review, fixed and verified (571 passed, ruff/pyright clean): Correctness: - all/batch now remove Doubao/Jimeng/Samsung visible text marks: the visible step routes through the registry (new cli._remove_visible_auto) instead of a hardcoded GeminiEngine, so they no longer leave the wordmark intact. - batch always reads the original source (dropped the out_path-reuse that re-processed already-cleaned outputs on a re-run). - img2img_runner only retries the diffusion call on the deprecated-callback TypeError; any other TypeError now propagates instead of double-running. - gemini detect/remove and the reverse-alpha engines normalize channels via a new image_io.to_bgr, fixing a grayscale/BGRA crash in the FP-gate path. - _png_late_metadata advances its cursor by the clamped length, so a malformed chunk length no longer aborts the late AI-label scan. Cleanup / efficiency: - Consolidate the ~90%-identical Doubao/Jimeng/Samsung engines into a shared config-driven _text_mark_engine.TextMarkEngine base; each engine is now a thin subclass (TextMarkConfig + test shims). Behavior is byte-exact (the three engine test suites pass unchanged). Registry adapters collapse to one _text_mark(...) row each. Gemini stays a separate engine. - scan_head is memoized per (path, size, mtime), so identify() reads the file head once instead of ~8 times. - invisible_engine post-processing decodes/encodes the output once (chained in memory) instead of 2-4 times across stages. 
- Remove the orphaned get_model_id_for_profile (+ CONTROLNET_PROFILE); derive the --strength help from the strength constants (strength_default_help) so it cannot drift; share the --pipeline/--strength click options; simplify the retired --auto resolver. Net -835 lines. Tests added for the registry-routed visible pass, to_bgr, the polish/model/guidance wiring, and strength_default_help. CLAUDE.md updated for the new base module, the engine/registry changes, image_io.to_bgr, and the scan_head cache. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 13 +- src/remove_ai_watermarks/_text_mark_engine.py | 349 ++++++++++++++ src/remove_ai_watermarks/cli.py | 72 ++- src/remove_ai_watermarks/doubao_engine.py | 423 +++-------------- src/remove_ai_watermarks/gemini_engine.py | 20 +- src/remove_ai_watermarks/image_io.py | 19 + src/remove_ai_watermarks/invisible_engine.py | 70 ++- src/remove_ai_watermarks/jimeng_engine.py | 438 +++--------------- src/remove_ai_watermarks/metadata.py | 28 +- .../noai/img2img_runner.py | 11 +- .../noai/watermark_profiles.py | 14 - src/remove_ai_watermarks/samsung_engine.py | 422 +++-------------- .../watermark_registry.py | 88 ++-- tests/test_cli.py | 15 + tests/test_gemini_engine.py | 13 + tests/test_image_io.py | 26 ++ tests/test_platform.py | 20 +- 17 files changed, 777 insertions(+), 1264 deletions(-) create mode 100644 src/remove_ai_watermarks/_text_mark_engine.py diff --git a/CLAUDE.md b/CLAUDE.md index 9c83850..9264b1c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,20 +35,21 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `noai/c2pa.py` — PNG chunk parser; use `extract_c2pa_chunk(path)` to get raw caBX payload, `has_c2pa_metadata(path)` to detect. Do not reimplement chunk parsing. 
`extract_c2pa_info(path)` sets `synthid_watermark`/`synthid_vendors` when the manifest is signed by a SynthID-using vendor, and `soft_binding`/`soft_binding_vendors` when a `c2pa.soft-binding` `alg` names a forensic-watermark vendor (`soft_binding_vendors_in(buffer)` is the shared byte-scan, used by both the PNG parser and the non-PNG binary path). PNG/caBX chunk reads are clamped to the remaining file size (`safe_length = min(length, remaining)`; skipped chunks use seek) so a malformed huge `length` cannot drive a multi-GB allocation (shared safety discipline matching `isobmff.scan_c2pa_region`). - `noai/constants.py` — PNG_SIGNATURE, C2PA_CHUNK_TYPE, C2PA_SIGNATURES, and `C2PA_AI_VENDORS` — the single `C2paAiVendor` registry of C2PA-signing vendors (issuer byte, resolved org name, the `identify` platform label, and a `synthid` flag), from which `C2PA_ISSUERS`, `SYNTHID_C2PA_ISSUERS` (issuers that pair SynthID with C2PA: Google, OpenAI), and `identify._ISSUER_PLATFORM` are all **derived** — plus `C2PA_SOFT_BINDINGS` (soft-binding `alg` prefix → forensic-watermark vendor: Adobe TrustMark, Digimarc, Imatag, Steg.AI, Microsoft, ...). Add a new C2PA vendor as one `C2PA_AI_VENDORS` entry (never edit the derived dicts), a new soft-binding to `C2PA_SOFT_BINDINGS`; not inline. -- `metadata.py` — `scan_head(path, size=1MB)` is the shared input for every C2PA/AIGC/IPTC byte scan: first `size` bytes plus the payloads of any provenance metadata found beyond that window — for ISOBMFF, the late provenance boxes from `isobmff.scan_c2pa_region` (catches a manifest after a large `mdat`); for **PNG**, the late `tEXt`/`iTXt`/`zTXt`/`eXIf`/`iCCP` chunks from `_png_late_metadata` (catches an XMP/EXIF packet appended after a large `IDAT`, e.g. a TC260 AIGC label at ~2.7 MB). Behavior-neutral (`f.read(size)`) for non-ISOBMFF inputs and for any file that fits within `size`. Use it instead of `open().read(1MB)` for any new marker scan. 
`synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. `xai_signature(path)` detects xAI/Grok's EXIF-only scheme (`ImageDescription` = `Signature: ` + UUID `Artist`); it feeds `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify`. `iptc_ai_system(path)` detects the IPTC Photo Metadata 2025.1 AI-disclosure XMP properties (`IPTC_AI_FIELD_MARKERS` = `AISystemUsed`/`AISystemVersionUsed`/`AIPromptInformation`/`AIPromptWriterName`) and returns the `AISystemUsed` generator name (or `"fields present"`). `remove_ai_metadata` routes **ISOBMFF video** (`.mp4`/`.mov`/`.m4v`) through the same `isobmff.strip_c2pa_boxes` as AVIF/HEIF (MP4 is ISOBMFF), and `_scrub_ai_exif` removes the xAI signature + AI-generator EXIF tags on JPEG output. `strip_c2pa_boxes` is **fail-safe** on a malformed box: it returns the original bytes unchanged with a logged warning instead of truncating the tail to EOF (detection-only `scan_c2pa_region` still stops at a malformed box). `_png_late_metadata` clamps each late-chunk read to the remaining file size (`safe_length = min(length, remaining)`) so a malformed `length` cannot drive a multi-GB allocation. 
+- `metadata.py` — `scan_head(path, size=1MB)` is the shared input for every C2PA/AIGC/IPTC byte scan: first `size` bytes plus the payloads of any provenance metadata found beyond that window — for ISOBMFF, the late provenance boxes from `isobmff.scan_c2pa_region` (catches a manifest after a large `mdat`); for **PNG**, the late `tEXt`/`iTXt`/`zTXt`/`eXIf`/`iCCP` chunks from `_png_late_metadata` (catches an XMP/EXIF packet appended after a large `IDAT`, e.g. a TC260 AIGC label at ~2.7 MB). Behavior-neutral (`f.read(size)`) for non-ISOBMFF inputs and for any file that fits within `size`. Use it instead of `open().read(1MB)` for any new marker scan. **Memoized per (path, size, mtime)** (added 2026-06-09, `_scan_head_cached` lru_cache, `maxsize=8`): one `identify`/`get_ai_metadata` call fans out to ~8 byte-scan detectors that each re-read the same file head, so the cache turns those into a single read; the mtime key invalidates on change, a stat failure falls back to an uncached read. `synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. `xai_signature(path)` detects xAI/Grok's EXIF-only scheme (`ImageDescription` = `Signature: ` + UUID `Artist`); it feeds `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify`. `iptc_ai_system(path)` detects the IPTC Photo Metadata 2025.1 AI-disclosure XMP properties (`IPTC_AI_FIELD_MARKERS` = `AISystemUsed`/`AISystemVersionUsed`/`AIPromptInformation`/`AIPromptWriterName`) and returns the `AISystemUsed` generator name (or `"fields present"`). 
`remove_ai_metadata` routes **ISOBMFF video** (`.mp4`/`.mov`/`.m4v`) through the same `isobmff.strip_c2pa_boxes` as AVIF/HEIF (MP4 is ISOBMFF), and `_scrub_ai_exif` removes the xAI signature + AI-generator EXIF tags on JPEG output. `strip_c2pa_boxes` is **fail-safe** on a malformed box: it returns the original bytes unchanged with a logged warning instead of truncating the tail to EOF (detection-only `scan_c2pa_region` still stops at a malformed box). `_png_late_metadata` clamps each late-chunk read to the remaining file size (`safe_length = min(length, remaining)`) so a malformed `length` cannot drive a multi-GB allocation, AND advances the cursor by `safe_length` (not the raw `length`) so an inflated length cannot jump past EOF and abort the scan, silently skipping a genuine AI-label chunk after it. - `identify.py` — the OpenAI rollout caveat is keyed on `_vendor_of(synthid) == "OpenAI"` (not a raw substring over the issuer + verdict blob). `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, C2PA soft-binding forensic-watermark vendor, IPTC "Made with AI" + IPTC 2025.1 `AISystemUsed`, embedded SD/ComfyUI params, SynthID proxy, xAI/Grok EXIF signature via `metadata.xai_signature`, the China TC260 AIGC label via `metadata.aigc_label`, the HuggingFace `hf-job-id` job marker via `metadata.huggingface_job`, the Samsung Galaxy AI editing marker via `metadata.samsung_genai`, the visible marks — Gemini sparkle plus the ByteDance Doubao 豆包AI生成 / Jimeng 即梦AI / Samsung Galaxy AI "Contenuti generati dall'AI" text marks via the `watermark_registry` — open invisible watermark, Adobe TrustMark via `trustmark_detector`) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). 
The `hf_job`, visible-mark, and Samsung `samsung_genai` signals are **medium** confidence: each lifts an otherwise-Unknown verdict to a tentative AI (`hf_only` / `visible_only` / `samsung_only`, parallel branches; `visible_only` fires on any `visible_*` signal) but is excluded from the high-confidence `ai_from_metadata` set, so none overrides a hard metadata signal. **Visible-mark detection** (`check_visible`, signals `visible_sparkle` / `visible_doubao` / `visible_jimeng` / `visible_samsung`): the Gemini sparkle keeps its own file-level path (`_visible_sparkle` → `gemini_engine.detect_sparkle_confidence`, promoted only at confidence ≥ `_SPARKLE_THRESHOLD` 0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49), while Doubao/Jimeng/Samsung reuse the registry detectors (`_visible_text_marks` → `watermark_registry`, iterating `_VISIBLE_MARK_PLATFORM`), each gated by its own engine NCC threshold via `MarkDetection.detected` (Doubao 0.4, Jimeng 0.45, Samsung 0.4). Doubao/Jimeng are normally also caught by the TC260 AIGC metadata label and Samsung by its C2PA + `genAIType` marker, so the visible path is their stripped-metadata fallback. Visible marks set `platform` only when no harder signal already did, and (like the sparkle) are excluded from integrity-clash vendor claims. The cv2 dependency lives in the engines, not here. **`import identify` is deliberately light** (~21 MB; ~36 MB with cv2 loaded by a visible-mark run, ~106 MB for a full `check_visible` run): it imports only the pure `noai.c2pa`/`noai.constants` submodules, and `noai/__init__` is lazy (see "Test and lint"), so torch/diffusers are NOT pulled at import even in a full `gpu`/`detect` install — fits a 512 MB host. The heavy paths are opt-in: `check_invisible=True` needs the `detect`/`trustmark` extras (each pulls **torch**; TrustMark also **downloads weights**), so on a core-only deploy leave `check_invisible` off (it is a no-op there anyway). 
Before the lazy `__init__`, the mere presence of torch in the env inflated `import identify` to ~420 MB. **C2PA platform attribution is device-token-first, issuer-scan fallback** (`_device_platform` scans manifest bytes for `_DEVICE_C2PA_PLATFORM` tokens, then `_attribute_platform`/`_ISSUER_PLATFORM`). **Why, verified on real signed files 2026-05-26:** the old issuer-only byte-scan matched ANY issuer substring anywhere, so multi-entity manifests mis-attributed -- Leica→"Truepic" (a signing authority in the trust chain), Nikon→"Adobe Firefly" (XMP-toolkit "Adobe" + the sample's "Adobe_MAX" name), Pixel→"Google (Gemini)" ("Google LLC" cert org), Truepic→"Google". A distinctive device token wins instead. **Token distinctiveness is load-bearing:** bare `b"Truepic"` mis-fires (it appears in unrelated trust chains -- it mis-attributed the OpenAI `chatgpt-1.png` fixture), so the token is the specific `b"Truepic_Lens"` from the Lens SDK claim generator; likewise `b"Pixel Camera"` (cert CN) not bare `b"Pixel"`. `_DEVICE_C2PA_PLATFORM` lists ONLY tokens **verified against a real C2PA file**: Leica (`lc_c2pa`/`Leica Camera`), Nikon (`NIKON`), Pixel (`Pixel Camera` -- from a real Pixel 10 Pro file attached to c2pa-rs issue #1609/#1554), Sony (`sony.sig`/`sony.cert` -- Sony's own C2PA assertion namespace, verified on a real Sony PXW-Z300 file; NOT bare "Sony" which is a common EXIF Make), Truepic (`Truepic_Lens`). Canon/Bria have **no public direct-download C2PA sample** (checked exhaustively: GitHub issue/PR attachments, contentcredentials gallery, HF datasets -- all upload-to-verify or token-gated; Canon's only public file was a self-signed hobbyist CR3, not factory), so they stay unmapped until a real file is captured (same fixture discipline as Grok/Doubao). The Sony sample is video (MP4) -- our ISOBMFF C2PA path detects it; Sony Alpha stills likely share the `sony.*` namespace but are not separately verified. 
**Samsung Galaxy + ASUS Gallery live in a separate `_SIGNER_C2PA_PLATFORM` (scanned after `_device_platform`, before the issuer fallback), NOT in `_DEVICE_C2PA_PLATFORM`** — verified on real signed files 2026-05-29. Reason: a Galaxy phone stamps BOTH its device cert AND a `trainedAlgorithmicMedia`/genAIType AI marker on a Generative-Edit image, so treating it as a "genuine camera capture" would false-fire integrity-clash rule 2 on every Galaxy AI edit. The signer tokens (`b"Samsung Galaxy"` cert org — distinct from the EXIF `SM-xxxx` model string on ordinary Samsung photos; `b"com.asus.gallery"` claim generator) only resolve the platform label; the AI verdict still comes from the source-type / genAIType. ASUS Gallery is a C2PA-signed edit with no AI marker, so it attributes the platform without asserting `is_ai`. **Samsung's `genAIType` (in the proprietary `PhotoEditor_Re_Edit_Data` JSON) is an undocumented Galaxy-AI editing marker** (`metadata.samsung_genai`, gated on the `PhotoEditor_Re_Edit_Data` container; non-zero value = AI tool used, values {1,5} observed): medium-confidence because the field has no public spec (verified 2026-05-29: absent from C2PA spec + Samsung docs), but it co-occurred with `trainedAlgorithmicMedia` in 3/3 verified files that record a source-type and was the SOLE AI marker on a Galaxy S24 file that omits the source type. Camera C2PA marks capture authenticity, not AI (Pixel carries `computationalCapture`, not `trainedAlgorithmicMedia`), so these never set `is_ai` -- that stays driven by digital-source-type. `c2pa.cbor_text_after` (now public) is best-effort for the `generator` detail string only and can be None when the manifest keys it `claim_generator_info` (Pixel). 
**Issuer→generator mapping is `is_ai`-gated** (`_attribute_platform(issuers, is_ai=c2pa_is_ai)`): a specific AI-generator platform is named only when the digital-source-type is `trainedAlgorithmicMedia`; on a non-AI source an issuer substring is treated as incidental (an "Adobe XMP" toolkit string in an *unmapped* Canon/Sony capture would otherwise mislabel it "Adobe Firefly"), so it degrades to the neutral "C2PA signer: X" label. Real Firefly/OpenAI/Google output carries the AI source-type, so it is unaffected (verified: chatgpt-1.png→OpenAI, firefly-1.png→Adobe Firefly still attribute). `_attribute_platform` defaults `is_ai=True` so the mapping stays unit-testable in isolation. Add capture-camera tokens to `_DEVICE_C2PA_PLATFORM`, editing-app/AI-device signer tokens to `_SIGNER_C2PA_PLATFORM`, generator/issuer platforms to the `C2PA_AI_VENDORS` registry in `constants.py` (which derives `_ISSUER_PLATFORM`), not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. **Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. **Integrity-clash detection** (`_integrity_clashes`, surfaced as `ProvenanceReport.integrity_clashes`, printed in red by `identify` and serialized to `--json`): contradictions between independent generator stamps are a laundering/spoofing tell. 
Two rules: (1) two or more distinct AI-origin vendors named by **independent** signals (e.g. C2PA OpenAI + EXIF `Make="Ideogram AI"`), and (2) a camera-capture C2PA device (`_DEVICE_C2PA_PLATFORM`) coexisting with any AI-generation marker. **Independence is source-grouped (`_CLASH_SOURCE`, added 2026-06-02):** the C2PA issuer attribution (`c2pa`) and the SynthID proxy (`synthid`) are NOT independent — the proxy is inferred from the *same* manifest — so they share one source and two vendors named within a single manifest do not clash. This killed a false-positive class found on the spaces corpus: legitimate multi-actor manifests where a product wraps another vendor's engine (Microsoft Designer on OpenAI → `OpenAI, Microsoft`; Microsoft on Google → `Microsoft, Google LLC, Google C2PA Core Generator Library`) or an edit chain re-signs (Adobe over a Gemini original → Adobe c2pa + Google synthid) — 19 such files across the 2026-06-01/02 batches read as clashes before the fix. Rule 1 still fires when a manifest vendor disagrees with a genuinely independent stamp (EXIF/XMP generator, IPTC `AISystemUsed`, AIGC, xAI); each non-`c2pa`/`synthid` family is its own source (`test_identify.py::TestIntegrityClashes::{test_multi_actor_manifest_no_clash,test_manifest_vendor_vs_independent_signal_clashes}`). Vendor normalization is `_vendor_of` over `_AI_VENDOR_TOKENS` (so a C2PA "Google (Gemini)" issuer and a SynthID-Google proxy agree, while different vendors clash). **High-precision by design:** only hard generator stamps feed it (C2PA-issuer when source is AI, SynthID, EXIF/XMP generator, IPTC `AISystemUsed`, xAI, AIGC); the fuzzy visible sparkle and the open invisible watermark are **excluded** (the latter can be a by-product of our own SDXL removal pass). The c2pa vendor is classified from the issuer attribution / generator, NOT the resolved `platform` (a camera label like "Google Pixel" would mis-normalize to "Google"). 
All real single-origin fixtures (chatgpt/firefly/doubao/grok/mj) verified to produce **zero** clashes (false-positive guard in `test_identify.py::TestRealSamplesHaveNoClash`). -- `watermark_registry.py` — **single catalog of known visible watermarks**, the unified "find known marks in their usual places, recognize, remove" entry. **Reverse-alpha based by policy**: a mark is listed only once a real alpha map has been captured for it, and removal inverts that map (`original = (wm - a*logo)/(1-a)`) — Gemini recovers cleanly with no inpaint (its sparkle alpha comes from a pure-black capture, so it is near-exact), while **Doubao, Jimeng, and Samsung all add an always-on THIN residual inpaint** over the glyph footprint (their text marks re-rasterize + jitter a few px per image, so a single capture cannot pixel-cancel them; the inpaint blends into the reverse-alpha-recovered pixels). Arbitrary-region inpainting still lives in `region_eraser`/`erase`. Each `KnownMark` ties a key to {usual `location`, `in_auto` flag, `recovery` (="reverse-alpha"), a `detect` adapter → uniform `MarkDetection`, a `remove` adapter}. Entries today: `gemini` (bottom-right sparkle), `doubao` (bottom-right "豆包AI生成"), `jimeng` (bottom-right "★ 即梦AI"), and `samsung` (bottom-**LEFT** "✦ Contenuti generati dall'AI", Samsung Galaxy AI, Italian locale). `detect_marks` scans all; `best_auto_mark` picks the highest-confidence detection. **Cross-engine confidences aren't directly comparable**, so the gemini adapter applies the corpus-validated 0.5 sparkle threshold (`_GEMINI_AUTO_MIN_CONF`) for its `detected` flag — otherwise the gemini engine's loose internal threshold weakly fires (~0.36) on the Doubao text and hijacks `auto`. 
The shape-keyed Doubao/Jimeng/Samsung NCC detectors don't cross-fire (jimeng scores ~0.22 on the Doubao strip, well under its 0.45 threshold; Samsung is bottom-left so it shares no corner with the others, and scored 0.0 on Doubao/Jimeng captures and they 0.0 on a real Samsung photo), so `auto` picks the right one. `cli.cmd_visible` is registry-driven: `--mark auto` → `best_auto_mark`, `--mark ` → that mark; `--mark` choices come from `mark_keys()`. `_doubao_remove`/`_jimeng_remove`/`_samsung_remove` apply reverse-alpha only when the mark is detected AND `reverse_alpha_available`; outside that, removal is **skipped** (not inpainted). Add a new visible mark = one `KnownMark` entry + its engine (with a captured alpha map); do not re-add per-mark `if` branches in the CLI. **Alpha-on-save policy (issue #30):** `cli._write_bgr_with_alpha` rejoins the input's alpha plane **unchanged** — it must NOT zero alpha in the watermark bbox. Reverse-alpha (and `erase` inpaint) recover real pixels there, so zeroing alpha punched a transparent hole that renders as a solid **white box** on any non-transparent viewer (Gemini app exports are opaque RGBA, so every user hit it; regression-guarded by `test_visible_keeps_alpha_opaque_in_watermark_region`). The registry `remove()` still returns its region (used for `inpaint_residual` positioning), but the CLI no longer uses it to clear alpha. +- `watermark_registry.py` — **single catalog of known visible watermarks**, the unified "find known marks in their usual places, recognize, remove" entry. 
**Reverse-alpha based by policy**: a mark is listed only once a real alpha map has been captured for it, and removal inverts that map (`original = (wm - a*logo)/(1-a)`) — Gemini recovers cleanly with no inpaint (its sparkle alpha comes from a pure-black capture, so it is near-exact), while **Doubao, Jimeng, and Samsung all add an always-on THIN residual inpaint** over the glyph footprint (their text marks re-rasterize + jitter a few px per image, so a single capture cannot pixel-cancel them; the inpaint blends into the reverse-alpha-recovered pixels). Arbitrary-region inpainting still lives in `region_eraser`/`erase`. Each `KnownMark` ties a key to {usual `location`, `in_auto` flag, `recovery` (="reverse-alpha"), a `detect` adapter → uniform `MarkDetection`, a `remove` adapter}. Entries today: `gemini` (bottom-right sparkle), `doubao` (bottom-right "豆包AI生成"), `jimeng` (bottom-right "★ 即梦AI"), and `samsung` (bottom-**LEFT** "✦ Contenuti generati dall'AI", Samsung Galaxy AI, Italian locale). `detect_marks` scans all; `best_auto_mark` picks the highest-confidence detection. **Cross-engine confidences aren't directly comparable**, so the gemini adapter applies the corpus-validated 0.5 sparkle threshold (`_GEMINI_AUTO_MIN_CONF`) for its `detected` flag — otherwise the gemini engine's loose internal threshold weakly fires (~0.36) on the Doubao text and hijacks `auto`. The shape-keyed Doubao/Jimeng/Samsung NCC detectors don't cross-fire (jimeng scores ~0.22 on the Doubao strip, well under its 0.45 threshold; Samsung is bottom-left so it shares no corner with the others, and scored 0.0 on Doubao/Jimeng captures and they 0.0 on a real Samsung photo), so `auto` picks the right one. `cli.cmd_visible` is registry-driven: `--mark auto` → `best_auto_mark`, `--mark <key>` → that mark; `--mark` choices come from `mark_keys()`. 
**`cli._remove_visible_auto` is the shared visible-removal helper used by `cmd_all`/`cmd_batch` too** (they no longer hardcode `GeminiEngine`), so `all`/`batch` remove Doubao/Jimeng/Samsung text marks, not just the Gemini sparkle (regression-guarded by `test_all_visible_step_uses_registry`). The three text-mark adapters were consolidated 2026-06-09: a single `_text_mark(key, label, location)` builds the registry row from one parameterized `_text_mark_detect`/`_text_mark_remove` pair (reverse-alpha only when detected/forced AND `reverse_alpha_available`, else skipped — no inpaint); the gemini adapters stay bespoke. Add a new visible mark = one `_text_mark(...)` row + its `TextMarkConfig` (with a captured alpha map); do not re-add per-mark `if` branches or copy-paste adapters. **Alpha-on-save policy (issue #30):** `cli._write_bgr_with_alpha` rejoins the input's alpha plane **unchanged** — it must NOT zero alpha in the watermark bbox. Reverse-alpha (and `erase` inpaint) recover real pixels there, so zeroing alpha punched a transparent hole that renders as a solid **white box** on any non-transparent viewer (Gemini app exports are opaque RGBA, so every user hit it; regression-guarded by `test_visible_keeps_alpha_opaque_in_watermark_region`). The registry `remove()` still returns its region (used for `inpaint_residual` positioning), but the CLI no longer uses it to clear alpha. - `gemini_engine.py` — visible Gemini-sparkle remover/detector (cv2/numpy, no GPU). `detect_sparkle_confidence(path)` is the file-level entry point used by `identify.py`. The public entry points normalize a grayscale (2D) or RGBA (4-channel) input to BGR up front so a non-BGR image does not crash the cv2 pipeline. **Detection localization (issue #36):** `detect_watermark`'s global multi-scale NCC search applies a size weight (`(scale/96)**0.5`) that suppresses tiny-patch false positives but can let a larger, mediocre match (e.g. 
a bright collar in a portrait) outrank a small, near-perfect sparkle in the corner — so a faint sparkle on a busy background scored below threshold and read as clean (the regression osachub reported from widening the search window 256px->512px between v0.7.2 and v0.8.8). `_corner_promote` adds a bottom-right-corner raw-NCC pass on top of the global search: a match with raw NCC >= `_CORNER_PROMOTE_NCC` 0.85 that beats the global pick overrides it (it only ever replaces a lower-fidelity pick, so it cannot weaken an existing detection), rescuing the buried sparkle without reverting the wider window. The corner side is **relative-clamped** (`_CORNER_PROMOTE_FRAC` 0.20 of the short side, clamped to `[_CORNER_PROMOTE_MIN` 96, `_CORNER_PROMOTE_MAX` 384`]`): a fixed 256px is a true corner on a large image but covers ~70% of a small portrait, where a real photo raw-matches the star at ~0.81 (relative tightening drops that worst case to ~0.69, while the upper clamp stops the corner ballooning on huge images where a real photo reached ~0.83 at 512px). The 0.85 gate sits midway between the worst real-photo corner match (~0.78 across native + downscaled negatives) and a genuine faint sparkle (~0.93), so promotion adds true detections with zero corpus false positives (Gemini's sparkle sits ~60-160px from the corner at fixed margins, covered by the [96, 384] band at every measured size). Regression-guarded by `test_gemini_engine.py::TestCornerPromotion`. **Removal is reverse-alpha with an over-subtraction guard** (`remove_watermark` → `_reverse_alpha_blend`, else `_inpaint_footprint`): the sparkle alpha is computed (`alpha = max(R,G,B)/255`) from the bundled sparkle-on-black captures `assets/gemini_bg_{96,48}.png` (the capture max is ~130, NOT 255 — the sparkle is a ~51%-opaque white overlay, so `alpha` maxes at ~0.51, which is CORRECT for the capture, not under-exposed). 
The alpha is near-exact only when the real mark's effective opacity matches the capture, which holds on bright/flat backgrounds — re-verified clean on `demo_banana_before.png` 2026-05-31. **Issue #30 (dark-background black pit):** on a dark/textured background (e.g. grass, ~73) the real sparkle's effective opacity is LOWER than the captured 0.51, so the fixed-alpha reverse blend OVER-subtracts (`watermarked - a*logo` goes negative) and drives the footprint to black — the white sparkle becomes a black diamond. `remove_watermark` now detects this via `_reverse_alpha_oversubtracts` (fraction of footprint pixels with `alpha >= _FOOTPRINT_ALPHA` 0.1 whose numerator < 0 exceeds `_OVERSUB_FOOTPRINT_FRAC` 0.05) and **inpaints the footprint** (`_inpaint_footprint`, cv2 NS over the dilated alpha mask) from the surrounding pixels instead. **Behavior-neutral on the working case:** a bright background over-subtracts at ~0% so reverse-alpha is used and the output is byte-identical to before (verified: demo_banana 0.0 frac vs issue-#30 grass 0.61 frac; regression-guarded by `test_gemini_engine.py::TestOverSubtractionGuard`, which composites the sparkle at a reduced effective alpha to reproduce the mismatch). **Under-subtraction (the symmetric case, fixed 2026-06-03):** some real Gemini sparkles are rendered MORE opaque than the captured ~0.51, so the fixed-alpha reverse blend UNDER-subtracts and leaves a bright sparkle residual the detector still fires on (measured on the spaces corpus: a visible-removal audit through the registry path left a detectable sparkle on a meaningful fraction of marks, all under-removals, NOT a background-brightness class — failures and successes had the same input confidence and the same background-luma distribution; the discriminator was the removal delta itself). 
`remove_watermark` now estimates a per-image alpha gain (`_estimate_alpha_gain`: effective sparkle opacity at the bright core vs the local background ring, `a_eff/a_cap`, clamped `[1.0, _ALPHA_GAIN_MAX` 1.94`]`) and scales the alpha to match before the over-sub/blend branch. The gain cleanly separates on the corpus (under-removed marks ~1.47, cleanly-removed ~1.00), and a deadband (`_ALPHA_GAIN_DEADBAND` 1.05) keeps a matching sparkle **byte-identical** to the pre-fix output, so the fix is purely additive (0 regressions on the audit set; the over-sub guard still runs on the scaled alpha as the safety net for an over-shooting estimate). Regression-guarded by `test_gemini_engine.py::TestUnderSubtractionGain` (composites a more-opaque-than-capture sparkle; **asserts on footprint pixels, NOT the detector** — the detector's NCC is degenerate on a flat synthetic background, so a re-detect conf is meaningless there; the real corpus removal drops the detector from ~0.80 to ~0.27). **False-positive gate (added 2026-06-03):** `detect_watermark`'s shape-only NCC (`spatial*0.5 + gradient*0.3 + var*0.2`) fires on ornate/flat content (text strips, banners, hatching) that coincidentally matches the diamond shape — a real Gemini sparkle is a bright WHITE overlay, so its core sits above the local background, but the NCC is contrast-invariant and cannot see that. The fusion now **demotes** (caps confidence to 0.30) any match that is BOTH low-confidence (`< _SPARKLE_FP_CONF` 0.65) AND has a low core-ring brightness margin (`_core_ring_margin < _SPARKLE_FP_MARGIN` 5). Real sparkles escape via EITHER high confidence (white-bg sparkles score ≥0.79 despite a low margin — the NCC shape match is strong) OR high margin (dark/mid backgrounds, incl. the #36 faint-corner case, lift well clear), so BOTH must fail to demote. The gate is **monotonic** (only ever removes detections, never adds), so it cannot regress the verified-negative corpus (already 0 FPs). 
On the spaces corpus it demoted 16/495 flagged sparkles (13 carried no AI metadata = content FPs; the 3 AI-meta were visually FPs / a near-invisible white-on-white sparkle whose AI verdict is held by metadata anyway), and dropped the removal-audit failures 20→15 (post-removal flat footprints the NCC re-fired on). `_core_ring_margin` and `_estimate_alpha_gain` share the `_core_and_bg` helper (core 75th-pct brightness vs background-ring median). Regression-guarded by `test_gemini_engine.py::TestSparkleFalsePositiveGate`. **Self-verify repair (added 2026-06-04):** the gain estimate corrects most under-subtractions, but a tail of strong sparkles still survived reverse-alpha (position jitter, or a gain the `[1.0, 1.94]` clamp could not fully reach). After the reverse blend, `remove_watermark` re-detects via `_verify_and_repair`; when a sparkle at or above `_VERIFY_FALLBACK_CONF` 0.5 (the registry's real fail line) remains, it inpaints the footprint and **keeps that only when it lowers the re-detect confidence** — purely additive (the common clean removal re-detects below 0.5 and is returned untouched, so it can never regress). On the spaces corpus this rescued **4 of the 15 remaining gemini removal-audit failures** (15→11, doubao/jimeng still 0), verified through the registry/CLI path. Costs one extra `detect_watermark` per removal (two when the fallback fires). Regression-guarded by `test_gemini_engine.py::TestVerifyAndRepair` (stubs `detect_watermark` to drive the keep-best control flow, since the NCC is degenerate on flat synthetics). **An offset+scale alignment search was prototyped on the remaining 11 fails and REJECTED (2026-06-04):** an audit "ceiling" test suggested it could rescue 4 more (e.g. 
a5a9 0.577→0.417), but direct inspection showed those were NCC-gaming, not removal — the lower-scoring placement left the sparkle as bright or BRIGHTER (a5a9: first-pass slot 99.5th-pct ~76 at background level, the "aligned win" slot ~164), it just reshaped the residual so the contrast-invariant shape-NCC scored lower. A slot-brightness sanity gate rejected every one, so alignment contributed 0 genuine rescues and was removed (the footprint inpaint stays because it physically reconstructs the slot from its darker surroundings, so its rescues are real). **Lesson: the visible-audit pass/fail metric (re-detect conf < 0.5) is gameable by reshaping the residual — optimizing it directly finds NCC-gaming placements, not clean removals; gate any removal candidate on a physical brightness check, not the detector alone.** The 11 survivors are near-white ill-conditioning (reverse-alpha divides by `1-a`≈0.02) or detector false positives (before≈after≈0.51) that no reverse-alpha placement fixes. The registry's optional `inpaint_residual` (edge cleanup) is a no-op on a clean reverse-alpha removal (and on the same corpus it lowered the re-detect conf on 3 marks, raised it on 10, no-op on 466 — net-neutral on pass/fail, so the self-verify repair, not it, drives the removal tail); an earlier "Gemini smears" read was a misjudged soft-fur original, not an artifact. **The bg assets are now rebuilt from OUR OWN controlled captures** (`data/gemini_capture/captures/`, committed) by `scripts/visible_alpha_solve.py gemini`, which locates the 96px sparkle on the black capture and crops it to the two logo sizes; our capture matched the previously third-party-sourced `gemini_bg_96.png` to **NCC 0.9998**, validating the asset and making it reproducible. Gemini's multi-size fixed-slot model is genuinely different from the Doubao/Jimeng text-strip engines (so it stays a separate engine, not part of the shared-base refactor). 
-- `doubao_engine.py` — visible Doubao "豆包AI生成" remover/detector (cv2/numpy, no GPU). `DoubaoEngine.locate` anchors a bottom-right box by **geometry** (mark scales with image WIDTH), `extract_mask` pulls the light, low-chroma glyphs (the detection candidate) using a per-pixel channel-spread proxy `sat = roi.max(axis=2) - roi.min(axis=2)` (no HSV conversion). `detect` is **shape-consistent**: it matches the bundled alpha glyph silhouette (`assets/doubao_alpha.png`) against the candidate via zero-mean normalized correlation (`_template_match_score`, cv2 `TM_CCOEFF_NORMED`), gated at `DETECT_NCC_THRESHOLD` 0.4 over a small `DETECT_MIN_COVERAGE` floor. Keying on glyph SHAPE (not coverage heuristics) fixed #23 (corpus FP 7/1243). **Removal = reverse-alpha + thin residual inpaint** (`remove_watermark_reverse_alpha`): `original = (wm - a*logo)/(1-a)` from the bundled alpha map + `_ALPHA_LOGO_BGR` (pure white) + `_ALPHA_*_FRAC` geometry, then a deliberately THIN inpaint (`_RESIDUAL_*`, `INPAINT_NS`) over the glyph footprint clears leftover edges without smearing. **Alpha is rebuilt by `scripts/visible_alpha_solve.py` (the careful gray-self solve: cubic background fit, mean over channels, full halo, unblurred), same recipe as Jimeng** — the captures are committed in `data/doubao_capture/captures/`. **Removal aligns ALWAYS** (no `_ALPHA_NATIVE_BAND` fast-path): it tries fixed geometry AND `_aligned_alpha_map`'s `TM_CCOEFF_NORMED` scale+position search and keeps the lower-residual one — the mark is re-rasterized and a few px off per image, so fixed geometry alone leaves a visible outline even at 2048. 
**The locate box (`WM_*`) is generous (0.22 wide, margins 0.004) and reaches close to the corner** — a tight box (the old 0.185 / margin 0.012) let a corner-ward shift fall OUTSIDE the alignment search, so the align missed and a readable outline survived; regression-guarded by `test_recovers_shifted_mark_on_texture` (composes the alpha shifted on a known texture; old box ~29 vs new ~1 mean residual). **Issue #13 follow-up defect (found 2026-05-31): the SHIPPED Doubao removal left a clearly READABLE "豆包AI生成" outline on the real `doubao-1.png` sample, while `detect` returned conf 0.0 (it is fooled by a thin outline) so `test_reverse_alpha_removes_mark` passed and the old "56/56 clean" claim was detector-measured, not visual.** Root cause: bad alpha (under-estimated, max ~0.65) + fixed-no-inpaint + tight box; the careful rebuild + always-align + thin inpaint + wide box takes it from a readable outline to faint texture-level traces (parity with Jimeng — a single capture cannot pixel-cancel a per-image re-rasterized mark). **Lesson: a detector-only removal test is insufficient; assert visual residual (the textured-shift test).** **`extract_mask` guards a degenerate ROI (`bh < 16 or bw < 16` -> empty mask, skips cv2):** the always-align removal scores each placement with a residual `detect(out)`, and on an extremely wide/short image (e.g. 2048x1, `test_wide_short_does_not_raise`) that fed cv2's GaussianBlur a ~1-px-tall ROI and **faulted natively on Windows py3.12 (access violation, non-deterministic — one CI cell went red while a re-run passed)**; the old at-native path never ran `detect` on degenerate sizes. Real images always clear the guard (the `WM_*` box floors are `max(16, …)` height / `max(40, …)` width), so it only short-circuits slivers. `reverse_alpha_available` is just "asset present"; the registry gates removal on `detect`. The shipped third-party `_refs/zhengsuanfa_doubao_alpha_120x20.png` is NOT a usable alpha (verified 2026-05-29). 
Arbitrary-region inpainting is `region_eraser`/`erase`. -- `jimeng_engine.py` — visible Jimeng / Dreamina "★ 即梦AI" remover/detector (cv2/numpy, no GPU), built 2026-05-30 from issue #13's solid captures (@powersee). Mirrors `doubao_engine`: `locate` anchors a bottom-right box by **geometry** (scales with WIDTH), `extract_mask` pulls the light low-chroma glyphs (white top-hat + grayish + min-luma), `detect` matches the bundled "即梦AI" glyph silhouette (`assets/jimeng_alpha.png`) via `TM_CCOEFF_NORMED` over a coverage floor. Threshold `DETECT_NCC_THRESHOLD` **0.45** cleanly separates real Jimeng marks (>=0.81) from the Doubao strip (0.21) and other AI output (0.0), so the two ByteDance marks don't cross-fire in `--mark auto`. **Logo is pure white (255,255,255)** (`_ALPHA_LOGO_BGR`; the white capture + an L-pair-solve confirm ~254.6); compositing is **sRGB, not linear** (a linear-light solve tripled the cross-residual). **Alpha rebuilt by `scripts/visible_alpha_solve.py` from the GRAY capture** (`data/jimeng_capture/captures/`, the solid captures now committed): `a = (I - B)/(255 - B)`, B a per-capture **cubic** background fit over the non-glyph pixels, **averaged over channels, full halo extent (down to a~0.02), unblurred**. Gray (bg ~132) is the deliberate choice over black: it is the best proxy for real content (the mark sits on bright photo areas, not on black), and the careful build drops the gray self-residual to ~1.3. **The mask quality, not the method, was the earlier limit** — a max-channel / quadratic-bg / blurred / halo-truncated build (and a black-dominated LS) left a visible outline (lesson from issue #13: when reverse-alpha leaves a ghost, suspect the captured alpha map before adding heuristics or switching method). Geometry emitted by the solver at `_ALPHA_NATIVE_WIDTH` 2048: `_ALPHA_WIDTH_FRAC` 0.202, `_ALPHA_HEIGHT_FRAC` 0.058, margins ~0.029. 
**Removal = reverse-alpha + a deliberately THIN residual inpaint** (`remove_watermark_reverse_alpha`, `_RESIDUAL_DILATE` 5 over the `_RESIDUAL_ALPHA_FLOOR` 0.05 footprint, `_RESIDUAL_INPAINT_RADIUS` 2, `INPAINT_NS`): a single 2048 alpha cannot pixel-cancel the mark re-rasterized at another resolution (alpha maps from independent captures correlate 0.998, not 1.0; off-native reverse-alpha alone only halves the mark), so a tight inpaint clears the residual edges WITHOUT the texture/edge smear a wide full-footprint pass caused. **Placement ALWAYS tries fixed geometry AND `_aligned_alpha_map`'s NCC scale+position search, keeping the lower-residual** — the mark re-rasterizes + jitters a few px per image even at the captured width, so fixed geometry alone misses (there is no `_ALPHA_NATIVE_BAND` fast-path; the scale search `_ALPHA_ALIGN_SEARCH` is fine-stepped, and the `WM_*` locate box is generous so a corner-ward shift stays inside the search — the same widen that fixed Doubao). Verified clean on the solid captures (native 2048; faint self-residual ~1.3 visible only on a dead-flat field, hidden by real texture) and a real 1440-wide Jimeng download (off-native, table edge preserved). `reverse_alpha_available` is just "asset present"; the registry gates on `detect`. **No committed real sample** (the real content download stays gitignored; only the solid calibration captures are committed) — `tests/test_jimeng_engine.py` synthesizes a mark from the bundled alpha asset, and `test_recovers_shifted_mark_on_texture` guards the align-on-shift path that the Doubao defect exposed. Jimeng images are independently caught by the China TC260 AIGC label in `metadata`/`identify`, so this engine is the visible-mark *removal* path, not a new `identify` signal. -- `samsung_engine.py` — visible Samsung Galaxy AI "✦ Contenuti generati dall'AI" remover/detector (cv2/numpy, no GPU), built 2026-06-05 from issue #37's flat captures (@f-liva). 
Mirrors `jimeng_engine` but anchored **bottom-LEFT** (Doubao/Jimeng are bottom-right): `locate` anchors a bottom-left box by **geometry** (scales with WIDTH), `extract_mask` pulls the light low-chroma glyphs (white top-hat + grayish + min-luma — `LOGO_MIN_LUMA` is lowered to **110** because the mark is faint, peak alpha ~0.38, so on a mid/dark background its glyph luma is lower than Jimeng's), `detect` matches the bundled glyph silhouette (`assets/samsung_alpha.png`) via `TM_CCOEFF_NORMED` over a coverage floor. Threshold `DETECT_NCC_THRESHOLD` **0.40** (real marks ~0.79 on a real photo, ~0.57/0.71 on the black/gray captures; 0.0 on Doubao/Jimeng captures, and Doubao/Jimeng score 0.0 on a real Samsung photo — no cross-fire, also because the corner differs). **Logo is pure white (255,255,255)** (`_ALPHA_LOGO_BGR`; white capture confirms). **Alpha solved by `scripts/visible_alpha_solve.py samsung` from the GRAY capture** (`data/samsung_capture/captures/`, the flat black/gray/white captures committed; the solver gained a `corner="bl"` mode + left-margin logging for this), same careful recipe as Jimeng (cubic background, mean-channel, full halo, unblurred). Geometry emitted at `_ALPHA_NATIVE_WIDTH` **1086** (the flat-edit capture width): `_ALPHA_WIDTH_FRAC` 0.3195, `_ALPHA_HEIGHT_FRAC` 0.0378, `_ALPHA_MARGIN_LEFT_FRAC` 0.0110, `_ALPHA_MARGIN_BOTTOM_FRAC` 0.0064. **Removal = reverse-alpha + a deliberately THIN residual inpaint** (`remove_watermark_reverse_alpha`, same `_RESIDUAL_*` recipe as Jimeng) with **always-try fixed AND `_aligned_alpha_map` NCC scale+position search, keep the lower-residual** (`_ALPHA_ALIGN_SEARCH` widened to (0.85, 1.18, 23) because the flat captures are far off the real-photo width). 
**Resolution caveat:** the flat captures arrived at 1086 wide while real photos are ~2958 wide (the mark scales with width, so the captured glyph ~334px is ~2.7x smaller than the ~903px real-photo glyph); width-scale + NCC-align still removes it cleanly (verified on a real 2958-wide @f-liva photo: re-detect 0.79→0.00, no readable text or outline on the recovered wooden table — checked **visually**, not just by the detector, per the Gemini self-verify lesson), but a flat capture at the real photo resolution would make the alpha pixel-sharp instead of upscaled (open quality upgrade, noted in `data/samsung_capture/README.md`). **The mark is locale-specific** (text differs per language); this build is the Italian "Contenuti generati dall'AI" variant — other locales need their own captured template. `reverse_alpha_available` is just "asset present"; the registry gates on `detect`. **No committed real sample** (the real photo stays gitignored; only the flat calibration captures are committed) — `tests/test_samsung_engine.py` synthesizes a mark from the bundled alpha asset (bottom-left geometry), with `test_recovers_shifted_mark_on_texture` guarding the align-on-shift path. Samsung Galaxy AI edits are independently caught by C2PA + the `genAIType` marker in `metadata`/`identify`, so this engine is the visible-mark *removal* path; it also feeds `identify` as the medium-confidence `visible_samsung` signal via the registry (the stripped-metadata fallback). +- `_text_mark_engine.py` — **shared base for the three reverse-alpha text-mark engines (Doubao/Jimeng/Samsung), extracted 2026-06-09** (they were ~90% byte-identical clones). `TextMarkEngine(config: TextMarkConfig)` owns the whole `locate → extract_mask → detect → _fixed/_aligned_alpha_map → _apply_reverse_alpha → remove_watermark_reverse_alpha` pipeline (+ the asset-keyed `load_alpha_template`/`glyph_silhouette`/`template_match_score` caches). 
Each engine module is now a thin subclass: it supplies only its `TextMarkConfig` (the tuned constants, the bundled asset, and the bounded structural deltas — `corner` br/bl, `margin_floor` 4/2, `morph_open_size` 5/3, `min_gw` 8/16) plus the test-facing module shims (`_alpha_template`/`_glyph_silhouette`/`_template_match_score` + the constants). Behavior is byte-exact vs the old per-engine code (the three engine test suites pass unchanged). Gemini stays a SEPARATE engine (its multi-size fixed-slot sparkle model is genuinely different). To add a new text mark: a new `TextMarkConfig` + a thin subclass + one registry `_text_mark(...)` row. The engine bullets below describe each mark's calibration history; the LOGIC lives here. +- `doubao_engine.py` — **a thin `_text_mark_engine.TextMarkEngine` subclass (config only) since 2026-06-09.** Visible Doubao "豆包AI生成" remover/detector (cv2/numpy, no GPU). `DoubaoEngine.locate` anchors a bottom-right box by **geometry** (mark scales with image WIDTH), `extract_mask` pulls the light, low-chroma glyphs (the detection candidate) using a per-pixel channel-spread proxy `sat = roi.max(axis=2) - roi.min(axis=2)` (no HSV conversion). `detect` is **shape-consistent**: it matches the bundled alpha glyph silhouette (`assets/doubao_alpha.png`) against the candidate via zero-mean normalized correlation (`_template_match_score`, cv2 `TM_CCOEFF_NORMED`), gated at `DETECT_NCC_THRESHOLD` 0.4 over a small `DETECT_MIN_COVERAGE` floor. Keying on glyph SHAPE (not coverage heuristics) fixed #23 (corpus FP 7/1243). **Removal = reverse-alpha + thin residual inpaint** (`remove_watermark_reverse_alpha`): `original = (wm - a*logo)/(1-a)` from the bundled alpha map + `_ALPHA_LOGO_BGR` (pure white) + `_ALPHA_*_FRAC` geometry, then a deliberately THIN inpaint (`_RESIDUAL_*`, `INPAINT_NS`) over the glyph footprint clears leftover edges without smearing.
**Alpha is rebuilt by `scripts/visible_alpha_solve.py` (the careful gray-self solve: cubic background fit, mean over channels, full halo, unblurred), same recipe as Jimeng** — the captures are committed in `data/doubao_capture/captures/`. **Removal aligns ALWAYS** (no `_ALPHA_NATIVE_BAND` fast-path): it tries fixed geometry AND `_aligned_alpha_map`'s `TM_CCOEFF_NORMED` scale+position search and keeps the lower-residual one — the mark is re-rasterized and a few px off per image, so fixed geometry alone leaves a visible outline even at 2048. **The locate box (`WM_*`) is generous (0.22 wide, margins 0.004) and reaches close to the corner** — a tight box (the old 0.185 / margin 0.012) let a corner-ward shift fall OUTSIDE the alignment search, so the align missed and a readable outline survived; regression-guarded by `test_recovers_shifted_mark_on_texture` (composes the alpha shifted on a known texture; old box ~29 vs new ~1 mean residual). **Issue #13 follow-up defect (found 2026-05-31): the SHIPPED Doubao removal left a clearly READABLE "豆包AI生成" outline on the real `doubao-1.png` sample, while `detect` returned conf 0.0 (it is fooled by a thin outline) so `test_reverse_alpha_removes_mark` passed and the old "56/56 clean" claim was detector-measured, not visual.** Root cause: bad alpha (under-estimated, max ~0.65) + fixed-no-inpaint + tight box; the careful rebuild + always-align + thin inpaint + wide box takes it from a readable outline to faint texture-level traces (parity with Jimeng — a single capture cannot pixel-cancel a per-image re-rasterized mark). **Lesson: a detector-only removal test is insufficient; assert visual residual (the textured-shift test).** **`extract_mask` guards a degenerate ROI (`bh < 16 or bw < 16` -> empty mask, skips cv2):** the always-align removal scores each placement with a residual `detect(out)`, and on an extremely wide/short image (e.g. 
2048x1, `test_wide_short_does_not_raise`) that fed cv2's GaussianBlur a ~1-px-tall ROI and **faulted natively on Windows py3.12 (access violation, non-deterministic — one CI cell went red while a re-run passed)**; the old at-native path never ran `detect` on degenerate sizes. Real images always clear the guard (the `WM_*` box floors are `max(16, …)` height / `max(40, …)` width), so it only short-circuits slivers. `reverse_alpha_available` is just "asset present"; the registry gates removal on `detect`. The shipped third-party `_refs/zhengsuanfa_doubao_alpha_120x20.png` is NOT a usable alpha (verified 2026-05-29). Arbitrary-region inpainting is `region_eraser`/`erase`. +- `jimeng_engine.py` — **a thin `TextMarkEngine` subclass (config only) since 2026-06-09.** Visible Jimeng / Dreamina "★ 即梦AI" remover/detector (cv2/numpy, no GPU), built 2026-05-30 from issue #13's solid captures (@powersee). Shares the base with `doubao_engine`: `locate` anchors a bottom-right box by **geometry** (scales with WIDTH), `extract_mask` pulls the light low-chroma glyphs (white top-hat + grayish + min-luma), `detect` matches the bundled "即梦AI" glyph silhouette (`assets/jimeng_alpha.png`) via `TM_CCOEFF_NORMED` over a coverage floor. Threshold `DETECT_NCC_THRESHOLD` **0.45** cleanly separates real Jimeng marks (>=0.81) from the Doubao strip (0.21) and other AI output (0.0), so the two ByteDance marks don't cross-fire in `--mark auto`. **Logo is pure white (255,255,255)** (`_ALPHA_LOGO_BGR`; the white capture + an L-pair-solve confirm ~254.6); compositing is **sRGB, not linear** (a linear-light solve tripled the cross-residual). **Alpha rebuilt by `scripts/visible_alpha_solve.py` from the GRAY capture** (`data/jimeng_capture/captures/`, the solid captures now committed): `a = (I - B)/(255 - B)`, B a per-capture **cubic** background fit over the non-glyph pixels, **averaged over channels, full halo extent (down to a~0.02), unblurred**.
Gray (bg ~132) is the deliberate choice over black: it is the best proxy for real content (the mark sits on bright photo areas, not on black), and the careful build drops the gray self-residual to ~1.3. **The mask quality, not the method, was the earlier limit** — a max-channel / quadratic-bg / blurred / halo-truncated build (and a black-dominated LS) left a visible outline (lesson from issue #13: when reverse-alpha leaves a ghost, suspect the captured alpha map before adding heuristics or switching method). Geometry emitted by the solver at `_ALPHA_NATIVE_WIDTH` 2048: `_ALPHA_WIDTH_FRAC` 0.202, `_ALPHA_HEIGHT_FRAC` 0.058, margins ~0.029. **Removal = reverse-alpha + a deliberately THIN residual inpaint** (`remove_watermark_reverse_alpha`, `_RESIDUAL_DILATE` 5 over the `_RESIDUAL_ALPHA_FLOOR` 0.05 footprint, `_RESIDUAL_INPAINT_RADIUS` 2, `INPAINT_NS`): a single 2048 alpha cannot pixel-cancel the mark re-rasterized at another resolution (alpha maps from independent captures correlate 0.998, not 1.0; off-native reverse-alpha alone only halves the mark), so a tight inpaint clears the residual edges WITHOUT the texture/edge smear a wide full-footprint pass caused. **Placement ALWAYS tries fixed geometry AND `_aligned_alpha_map`'s NCC scale+position search, keeping the lower-residual** — the mark re-rasterizes + jitters a few px per image even at the captured width, so fixed geometry alone misses (there is no `_ALPHA_NATIVE_BAND` fast-path; the scale search `_ALPHA_ALIGN_SEARCH` is fine-stepped, and the `WM_*` locate box is generous so a corner-ward shift stays inside the search — the same widen that fixed Doubao). Verified clean on the solid captures (native 2048; faint self-residual ~1.3 visible only on a dead-flat field, hidden by real texture) and a real 1440-wide Jimeng download (off-native, table edge preserved). `reverse_alpha_available` is just "asset present"; the registry gates on `detect`. 
**No committed real sample** (the real content download stays gitignored; only the solid calibration captures are committed) — `tests/test_jimeng_engine.py` synthesizes a mark from the bundled alpha asset, and `test_recovers_shifted_mark_on_texture` guards the align-on-shift path that the Doubao defect exposed. Jimeng images are independently caught by the China TC260 AIGC label in `metadata`/`identify`, so this engine is the visible-mark *removal* path, not a new `identify` signal. +- `samsung_engine.py` — **a thin `TextMarkEngine` subclass (config only) since 2026-06-09.** Visible Samsung Galaxy AI "✦ Contenuti generati dall'AI" remover/detector (cv2/numpy, no GPU), built 2026-06-05 from issue #37's flat captures (@f-liva). Shares the base but is anchored **bottom-LEFT** (Doubao/Jimeng are bottom-right): `locate` anchors a bottom-left box by **geometry** (scales with WIDTH), `extract_mask` pulls the light low-chroma glyphs (white top-hat + grayish + min-luma — `LOGO_MIN_LUMA` is lowered to **110** because the mark is faint, peak alpha ~0.38, so on a mid/dark background its glyph luma is lower than Jimeng's), `detect` matches the bundled glyph silhouette (`assets/samsung_alpha.png`) via `TM_CCOEFF_NORMED` over a coverage floor. Threshold `DETECT_NCC_THRESHOLD` **0.40** (real marks ~0.79 on a real photo, ~0.57/0.71 on the black/gray captures; 0.0 on Doubao/Jimeng captures, and Doubao/Jimeng score 0.0 on a real Samsung photo — no cross-fire, also because the corner differs). **Logo is pure white (255,255,255)** (`_ALPHA_LOGO_BGR`; white capture confirms). **Alpha solved by `scripts/visible_alpha_solve.py samsung` from the GRAY capture** (`data/samsung_capture/captures/`, the flat black/gray/white captures committed; the solver gained a `corner="bl"` mode + left-margin logging for this), same careful recipe as Jimeng (cubic background, mean-channel, full halo, unblurred).
Geometry emitted at `_ALPHA_NATIVE_WIDTH` **1086** (the flat-edit capture width): `_ALPHA_WIDTH_FRAC` 0.3195, `_ALPHA_HEIGHT_FRAC` 0.0378, `_ALPHA_MARGIN_LEFT_FRAC` 0.0110, `_ALPHA_MARGIN_BOTTOM_FRAC` 0.0064. **Removal = reverse-alpha + a deliberately THIN residual inpaint** (`remove_watermark_reverse_alpha`, same `_RESIDUAL_*` recipe as Jimeng) with **always-try fixed AND `_aligned_alpha_map` NCC scale+position search, keep the lower-residual** (`_ALPHA_ALIGN_SEARCH` widened to (0.85, 1.18, 23) because the flat captures are far off the real-photo width). **Resolution caveat:** the flat captures arrived at 1086 wide while real photos are ~2958 wide (the mark scales with width, so the captured glyph ~334px is ~2.7x smaller than the ~903px real-photo glyph); width-scale + NCC-align still removes it cleanly (verified on a real 2958-wide @f-liva photo: re-detect 0.79→0.00, no readable text or outline on the recovered wooden table — checked **visually**, not just by the detector, per the Gemini self-verify lesson), but a flat capture at the real photo resolution would make the alpha pixel-sharp instead of upscaled (open quality upgrade, noted in `data/samsung_capture/README.md`). **The mark is locale-specific** (text differs per language); this build is the Italian "Contenuti generati dall'AI" variant — other locales need their own captured template. `reverse_alpha_available` is just "asset present"; the registry gates on `detect`. **No committed real sample** (the real photo stays gitignored; only the flat calibration captures are committed) — `tests/test_samsung_engine.py` synthesizes a mark from the bundled alpha asset (bottom-left geometry), with `test_recovers_shifted_mark_on_texture` guarding the align-on-shift path. 
Samsung Galaxy AI edits are independently caught by C2PA + the `genAIType` marker in `metadata`/`identify`, so this engine is the visible-mark *removal* path; it also feeds `identify` as the medium-confidence `visible_samsung` signal via the registry (the stripped-metadata fallback). - `region_eraser.py` — universal region eraser (`erase` CLI). `erase(image, boxes=|mask=, backend=)` accepts grayscale (2D) and RGBA (4-channel) inputs on **both** backends (`erase_cv2` and `erase_lama` each split off any alpha plane and re-attach it unchanged, and promote grayscale to BGR for processing — LaMa would otherwise crash on grayscale and drop alpha on BGRA): `boxes_to_mask` → `cv2.inpaint` (`cv2` backend, default, no deps) or big-LaMa via onnxruntime (`lama` backend, extra `lama`, `Carve/LaMa-ONNX` Apache-2.0 model downloaded on first use, never bundled). `erase_lama` crops a padded region around the mask, runs LaMa at its fixed 512² input, pastes only masked pixels back (untouched areas stay pixel-exact). Lazy `_get_lama_session` singleton; `lama_available()` guards the optional import. **LaMa-ONNX costs ~3.5-4 GB peak RAM and ~5-6 s/call on CPU** (FFC working set, not arena — `enable_cpu_mem_arena=False` does not help), so it does NOT fit a minimal droplet; the cv2 backend (tens of MB, ~30 ms) does. LaMa quality at low RAM = serverless/GPU, mirroring how raiw.cc offloads SDXL to fal. - `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. 
The `detect` extra pulls **torch** transitively (invisible-watermark declares torch a hard dep, and `WatermarkDecoder` eagerly imports `rivaGan` -> `torch` at import time), so detection needs torch present even though dwtDct runs CPU-only on cv2/numpy/pywavelets — no GPU and no separate `gpu` extra required. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. - `trustmark_detector.py` — `detect_trustmark(path)` decodes the OPEN, keyless **Adobe TrustMark** watermark (the soft binding behind Adobe Durable Content Credentials, `alg` `com.adobe.trustmark.P`) via the optional `trustmark` package (extra `trustmark`; pulls torch, downloads model weights on first use). Mirrors `invisible_watermark.py` (lazy singleton guarded by a double-checked `threading.Lock` so concurrent callers do not double-download the weights, top-of-module pyright pragma, returns None when absent). It detects *provenance*, not AI origin as such (TrustMark also marks human-authored content), so `identify` lists it as a watermark without setting `is_ai_generated`. Other soft-binding vendors (Digimarc/Imatag/Steg.AI/...) have no public decoder — they are only *named* via the `C2PA_SOFT_BINDINGS` scan, not decoded. **False-positive gate (added 2026-05-29):** TrustMark's `wm_present` is a BCH error-correction validity flag that spuriously validates on a content-correlated fraction of un-watermarked images — AI-generated textures trip it far more than camera photos (verified 2026-05-29 on real files: it fires on Gemini/OpenAI/Doubao output that *cannot* carry Adobe's watermark, with a random-bytes decoded secret, while signal-free camera photos did not trip it). 
A genuine TrustMark is a *durable* soft binding engineered to survive re-encoding, so `detect_trustmark` re-decodes after a mild JPEG round-trip (`_survives_reencode`, `_REENCODE_QUALITY` 95) and requires the same schema both times; every observed false positive collapsed (none survived even q95), so the gate is the durability property the watermark guarantees. The second decode runs only on the rare initial hit, so the cost is negligible. Do NOT remove the gate to "catch more" — a lone TrustMark hit without it is almost always content noise. - `noai/watermark_remover.py` — the `WatermarkRemover` class has two diffusion pipelines, selected by the explicit `pipeline` ctor arg (NOT inferred from `model_id` -- both use the same SDXL base, `DEFAULT_MODEL_ID`). **`sdxl`** (renamed from `default` 2026-06-09; `default` kept as a back-compat alias via `normalize_profile`) runs plain SDXL img2img (`_run_img2img`); it is the lighter opt-down alternative (no ControlNet weights). **`controlnet`** (**the DEFAULT pipeline since 2026-06-09** for `invisible`/`all`/`batch` and both engine ctors; `_run_controlnet`, `_load_controlnet_pipeline`) runs `StableDiffusionXLControlNetImg2ImgPipeline` with the SDXL-native canny ControlNet `xinsir/controlnet-canny-sdxl-1.0` (`watermark_profiles.CONTROLNET_CANNY_MODEL`): the control image is `cv2.Canny(gray, 100, 200)` stacked to 3 channels (`_CANNY_LOW`/`_CANNY_HIGH`, prompt `_CONTROLNET_PROMPT` / `_CONTROLNET_NEGATIVE`). 
**Removal comes from the img2img regeneration (`strength`); the ControlNet only PRESERVES text and face STRUCTURE via the edge map.** No original pixels are copied or frozen, BUT **validation 2026-06-04 disproved the old "so SynthID does not survive" claim: SynthID CAN survive controlnet on photoreal/high-detail content.** At the shared low removal strength the canny edge-conditioning keeps the regeneration so close to the original that the pixel perturbation that destroys SynthID does not happen (oracle-confirmed: an OpenAI bracelet photo + a 9-face grid read **SynthID-detected** after controlnet at strength 0.10/0.15, but **SynthID-not-detected** after the `default` pipeline at the SAME strength + resolution -- only the pipeline differed). **But the reverse also holds: a flat-graphic logo/poster SURVIVED `default` while clearing controlnet** -- removal at the low strength is content×pipeline dependent and neither pipeline is universally safe; the real lever is a higher strength. See the controlnet Known-limitations bullet for the full table + root cause. Canny holds face STRUCTURE but NOT identity (the regenerated face drifts in likeness -- canny carries edges, not identity). The drifted cleaned face is the LEAST-AI state we can reach without re-introducing SynthID; the library does NOT ship a face-restore extra. Every restore approach we evaluated (GFPGAN-on-cleaned, PhotoMaker-V2 txt2img, InstantID txt2img, InstantID img2img-on-cleaned at three parameter sweeps, 2026-06-04 - 2026-06-08 Modal cert sweeps) regenerated the face from an ArcFace embedding via SDXL diffusion -- which makes the output face look MORE AI-generated, not less. Empirical conclusion in `docs/synthid-robust-identity-research-2026-06-08.md` "Empirical follow-up". For production face preservation, ship the cleaned image as-is. `controlnet_conditioning_scale` (ctor arg, default 1.0) is the structure-preservation knob. 
Same dtype rule as `default` (fp32 on cpu/mps, fp16 only on cuda/xpu; the fp16-fixed SDXL VAE `_SDXL_FP16_VAE_ID` is swapped in on fp16 GPUs -- issue #29) and the same MPS->CPU fallback (reload on cpu/fp32, drop a non-cpu generator, retry once). - **`auto_config.py` + the content-detection layer were REMOVED 2026-06-09.** History: `auto_config.plan()` was a content-adaptive planner that detected faces/text/edges (bundled OpenCV YuNet + PP-OCRv3 DBNet models) to route the pipeline and toggle the adaptive polish. Once `controlnet` became the default-and-only auto pipeline (it no longer downgrades a structure-less image to `sdxl`) and the adaptive polish was confirmed to **self-gate by detail level** (`humanizer.adaptive_polish` no-ops when the cleaned image already meets the input's Laplacian variance, so it does real work only on over-smoothed photo/face texture and ~nothing on text/flat), the detection no longer changed any behavior — it only annotated a `reason` string. So the whole layer was deleted: `auto_config.py`, `tests/test_auto_config.py`, and the two detection assets (`assets/face_detection_yunet_2023mar.onnx`, `assets/text_detection_ppocrv3_2023may.onnx`, ~2.6 MB). **`--auto` is now a DEPRECATED no-op** (`cli._resolve_auto_polish`): controlnet is already the default pipeline AND the adaptive polish is ON by default, so `--auto` has nothing left to do — it only prints a deprecation warning and passes `adaptive_polish` through unchanged (an explicit `--no-adaptive-polish` still wins). (Originally it re-enabled the polish; once the polish default flipped to ON the same day, the parameter-source branch became dead and was dropped.) The **adaptive polish itself lives on** in `humanizer.adaptive_polish` (CLI `--adaptive-polish/--no-adaptive-polish`, **ON by default since 2026-06-09** — it self-gates to a no-op where there is no detail deficit, so default-on is safe; uses the full-res original as the detail reference) — see the `humanizer` test note. 
`batch` resolves the polish once before the loop (one warning) and caches the invisible engine per pipeline (`ctx.obj["_inv_engines"]`). - `upscaler.py` — optional Real-ESRGAN pre-diffusion super-resolution for small inputs (spandrel boundary, top-of-file pyright pragma). `is_available()` gates on spandrel+torch (via `importlib.util.find_spec`); `upscale(bgr, device=None)` loads a lazily-built spandrel `ImageModelDescriptor` singleton (double-checked lock) and upscales by the model's native factor (x2), with a non-CPU→CPU device fallback mirroring the diffusion engine's MPS→CPU retry. Weights (`RealESRGAN_x2plus.pth`, BSD-3-Clause) download on first use to the `torch.hub` checkpoints cache; never bundled. Used only when UPscaling to the `min_resolution` floor (a `max_resolution` downscale always uses Lanczos). The wiring is `InvisibleEngine._esrgan_upscale(pil, target)` — Real-ESRGAN at native factor, then a Lanczos resize to the exact target, falling back to a plain Lanczos resize if the extra is absent or the model errors (so an optional upscaler can never break removal). The default `--upscaler` is `lanczos` (cv2, no deps). **ESRGAN is a generic photo/texture GAN with no face/glyph prior**, so it best fits photo/texture content and can degrade faces (glassy/asymmetric eyes -- the diffusion pass regenerates faces so the full-pipeline final recovers) and thin/small text (the GAN invents wrong strokes, and low-strength diffusion will not fix it). Verified 2026-06-04: isolated upscale lap-var ~5x Lanczos on faces+textures but glassy eyes; end-to-end `invisible` final lap-var 1634 vs Lanczos 663 with natural faces (diffusion cleaned the artifact). Kept a **manual opt-in knob** (the auto plan never selects it) with `lanczos` the default; not content-gated by design (use Lanczos for text-heavy inputs). spandrel is MIT and pulls no basicsr. 
Unit-tested without the model: `tests/test_upscaler.py` (availability guard + the not-installed RuntimeError) and `tests/test_invisible_engine.py::TestEsrganUpscale` (the three `_esrgan_upscale` branches via a monkeypatched `upscaler`). -- `image_io.py` — Unicode-safe cv2 IO (issue #17). `imread(path, flags=None)` / `imwrite(path, img)` wrap `np.fromfile`+`cv2.imdecode` / `cv2.imencode`+`tofile` so non-ASCII paths work on Windows -- bare `cv2.imread`/`cv2.imwrite` use the platform ANSI code-page API there and fail (empty decode + `can't open/read file`) on Chinese/Cyrillic/accented filenames. `imread` keeps `cv2.imread` semantics (defaults to `IMREAD_COLOR`, returns `None` on missing/empty/undecodable). **Every cv2 file read/write in the package routes through here; do not call `cv2.imread`/`cv2.imwrite` directly.** `imwrite` returns `False` on an unwritable path (`OSError` caught) instead of raising, matching `cv2.imwrite` semantics. macOS/Linux already accept UTF-8 paths, so it is behavior-neutral there (the bug only reproduces on Windows). cv2/numpy are imported lazily inside the functions, so the module is cheap to import in a bare env. +- `image_io.py` — Unicode-safe cv2 IO (issue #17). `imread(path, flags=None)` / `imwrite(path, img)` wrap `np.fromfile`+`cv2.imdecode` / `cv2.imencode`+`tofile` so non-ASCII paths work on Windows -- bare `cv2.imread`/`cv2.imwrite` use the platform ANSI code-page API there and fail (empty decode + `can't open/read file`) on Chinese/Cyrillic/accented filenames. `imread` keeps `cv2.imread` semantics (defaults to `IMREAD_COLOR`, returns `None` on missing/empty/undecodable). **Every cv2 file read/write in the package routes through here; do not call `cv2.imread`/`cv2.imwrite` directly.** `imwrite` returns `False` on an unwritable path (`OSError` caught) instead of raising, matching `cv2.imwrite` semantics. macOS/Linux already accept UTF-8 paths, so it is behavior-neutral there (the bug only reproduces on Windows). 
**`to_bgr(image)` (added 2026-06-09)** is the shared channel normalizer: promotes 2D grayscale / (h,w,1) / 4-channel BGRA to 3-channel BGR (a 3-channel input is returned unchanged, no copy). Use it instead of inlining the `cvtColor(GRAY2BGR/BGRA2BGR)` branch — the gemini engine and the `TextMarkEngine` base both route through it so a grayscale/BGRA input (a real Gemini-app export is opaque RGBA) does not crash the `axis=2` channel reductions. cv2/numpy are imported lazily inside the functions, so the module is cheap to import in a bare env. ### Doubao clean-reverse-alpha distillation (re-investigated 2026-05-29) diff --git a/src/remove_ai_watermarks/_text_mark_engine.py b/src/remove_ai_watermarks/_text_mark_engine.py new file mode 100644 index 0000000..8fae06d --- /dev/null +++ b/src/remove_ai_watermarks/_text_mark_engine.py @@ -0,0 +1,349 @@ +"""Shared base for the reverse-alpha visible text-mark engines. + +The Doubao "豆包AI生成", Jimeng "★ 即梦AI", and Samsung "✦ Contenuti generati +dall'AI" marks are the SAME algorithm: anchor a bottom-corner box by width-relative +geometry, extract the light low-saturation glyph candidate, detect by matching the +bundled alpha-glyph silhouette via ``TM_CCOEFF_NORMED``, and remove by inverting the +alpha blend ``original = (wm - a*logo)/(1-a)`` (always trying fixed AND NCC-aligned +placement, keeping the lower-residual one) plus a thin footprint inpaint. + +They differ ONLY in a bounded set of tuned values captured by :class:`TextMarkConfig`: +the constants, the bundled asset, the corner (Doubao/Jimeng bottom-right, Samsung +bottom-left), and a few structural knobs (the morphology-open kernel size and the +minimum glyph width used by the alignment / template-match). Each engine module is a +thin :class:`TextMarkEngine` subclass plus the test-facing module constants/helpers. + +Gemini stays a SEPARATE engine (``gemini_engine``): its multi-size fixed-slot sparkle +model is genuinely different, not a tuned variant of this one. 
+""" + +# cv2/numpy boundary: third-party libs ship no usable element types; relax the +# unknown-type rules for this file only. +# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false +from __future__ import annotations + +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +import cv2 +import numpy as np + +from remove_ai_watermarks import image_io + +if TYPE_CHECKING: + from numpy.typing import NDArray + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class TextMarkConfig: + """All per-mark tuning for a reverse-alpha text-mark engine.""" + + name: str # short label for log lines (e.g. "Doubao") + asset_name: str # bundled alpha PNG under assets/ (e.g. 
"doubao_alpha.png") + corner: Literal["br", "bl"] # bottom-right (Doubao/Jimeng) or bottom-left (Samsung) + margin_floor: int # min margin in px for locate (4 for br marks, 2 for Samsung) + # locate geometry (fraction of image WIDTH) + width_frac: float + height_frac: float + margin_x_frac: float # right margin (br) or left margin (bl) + margin_bottom_frac: float + # glyph appearance + max_saturation: float + logo_min_luma: float + tophat_delta: float + morph_open_size: int # MORPH_OPEN kernel side (5 for br marks, 3 for Samsung) + # detection + detect_min_coverage: float + detect_ncc_threshold: float + # alpha-map geometry (fraction of WIDTH) emitted by scripts/visible_alpha_solve.py + alpha_width_frac: float + alpha_height_frac: float + alpha_margin_x_frac: float + alpha_margin_bottom_frac: float + alpha_align_search: tuple[float, float, int] # np.linspace(start, stop, num) scale search + min_gw: int # minimum glyph width for the template match / align search (8 br, 16 Samsung) + alpha_logo_bgr: tuple[float, float, float] = (255.0, 255.0, 255.0) + # residual inpaint over the glyph footprint (thin) + residual_alpha_floor: float = 0.05 + residual_dilate: int = 5 + residual_inpaint_radius: int = 2 + + +@dataclass +class TextMarkLocation: + """Located watermark box, in absolute pixel coordinates.""" + + x: int + y: int + w: int + h: int + is_fallback: bool = True # geometry anchor (no template match) -> always True for now + + @property + def bbox(self) -> tuple[int, int, int, int]: + return self.x, self.y, self.w, self.h + + +@dataclass +class TextMarkDetection: + """Result of visible text-mark detection.""" + + detected: bool = False + confidence: float = 0.0 + region: tuple[int, int, int, int] = (0, 0, 0, 0) + coverage: float = 0.0 # fraction of the box occupied by glyph pixels + + +# Alpha / silhouette templates, cached per asset name (the originals cached per +# module global; this keys by asset so the three engines share the loader without +# re-reading). 
Only SUCCESSFUL loads are cached, so a missing asset is retried. +_alpha_cache: dict[str, NDArray[Any]] = {} +_silhouette_cache: dict[str, NDArray[Any]] = {} + + +def load_alpha_template(asset_name: str) -> NDArray[Any] | None: + """Lazily load the bundled alpha template (float [0,1]) for ``asset_name``, or None.""" + cached = _alpha_cache.get(asset_name) + if cached is not None: + return cached + path = Path(__file__).parent / "assets" / asset_name + img = image_io.imread(str(path), cv2.IMREAD_GRAYSCALE) + if img is None: + return None + _alpha_cache[asset_name] = img.astype(np.float32) / 255.0 + return _alpha_cache[asset_name] + + +def glyph_silhouette(asset_name: str) -> NDArray[Any] | None: + """Binary glyph silhouette (255 = glyph) from the bundled alpha map, or None.""" + cached = _silhouette_cache.get(asset_name) + if cached is not None: + return cached + at = load_alpha_template(asset_name) + if at is None: + return None + _silhouette_cache[asset_name] = (at > 0.15).astype(np.uint8) * 255 + return _silhouette_cache[asset_name] + + +def template_match_score(box_mask: NDArray[Any], image_width: int, config: TextMarkConfig) -> float: + """Zero-mean normalized correlation of the alpha-template glyph silhouette + (scaled to the mark's expected size) against the candidate ``box_mask``. + + ``TM_CCOEFF_NORMED`` keys on glyph SHAPE, not coverage, so a dense textured + corner does not score highly -- only the actual glyph shape does. 
+ """ + sil = glyph_silhouette(config.asset_name) + if sil is None or box_mask.size == 0: + return 0.0 + gw = min(box_mask.shape[1] - 1, max(config.min_gw, int(config.alpha_width_frac * image_width))) + gh = min(box_mask.shape[0] - 1, max(4, int(config.alpha_height_frac * image_width))) + if gw < config.min_gw or gh < 4: + return 0.0 + template = cv2.resize(sil, (gw, gh), interpolation=cv2.INTER_NEAREST) + return float(cv2.matchTemplate(box_mask, template, cv2.TM_CCOEFF_NORMED).max()) + + +class TextMarkEngine: + """Reverse-alpha visible text-mark remover (locate -> mask -> detect -> reverse-alpha).""" + + def __init__(self, config: TextMarkConfig) -> None: + self.config = config + + # ── Templates (delegate to the asset-keyed module cache) ──────────── + + def _alpha_template(self) -> NDArray[Any] | None: + return load_alpha_template(self.config.asset_name) + + def _glyph_silhouette(self) -> NDArray[Any] | None: + return glyph_silhouette(self.config.asset_name) + + def _template_match_score(self, box_mask: NDArray[Any], image_width: int) -> float: + return template_match_score(box_mask, image_width, self.config) + + # ── Locate ────────────────────────────────────────────────────────── + + def locate(self, image: NDArray[Any]) -> TextMarkLocation: + """Anchor the watermark box in the configured bottom corner by geometry.""" + c = self.config + h, w = image.shape[:2] + wm_w = max(40, int(w * c.width_frac)) + wm_h = max(16, int(w * c.height_frac)) + margin_x = max(c.margin_floor, int(w * c.margin_x_frac)) + margin_b = max(c.margin_floor, int(w * c.margin_bottom_frac)) + x = max(0, w - margin_x - wm_w) if c.corner == "br" else min(margin_x, max(0, w - wm_w)) + y = max(0, h - margin_b - wm_h) + wm_w = min(wm_w, w - x) + wm_h = min(wm_h, h - y) + return TextMarkLocation(x=x, y=y, w=wm_w, h=wm_h, is_fallback=True) + + # ── Mask ──────────────────────────────────────────────────────────── + + def extract_mask(self, image: NDArray[Any], loc: TextMarkLocation) -> 
NDArray[Any]: + """Build a full-image uint8 mask (255 = watermark glyph) for the box. + + Polarity-aware: the mark is a light, low-saturation gray rendered brighter + than the local background (white top-hat), so a white-paper document is left + untouched (nothing brighter than its surroundings is masked there). + """ + c = self.config + h, w = image.shape[:2] + x, y, bw, bh = loc.bbox + # A degenerate ROI (a sliver from an extremely wide/short image) cannot hold + # the mark and would feed cv2's GaussianBlur/morphology a ~1-px-tall array, + # which can fault native code on some platforms. Skip the cv2 pipeline. + if bh < 16 or bw < 16: + return np.zeros((h, w), np.uint8) + # Normalize the ROI to 3-channel BGR (grayscale / BGRA would break axis=2). + roi = image_io.to_bgr(image[y : y + bh, x : x + bw]).astype(np.float32) + + luma = roi.mean(axis=2) + sat = roi.max(axis=2) - roi.min(axis=2) + grayish = sat < c.max_saturation + + # Local background model: a strong Gaussian blur (sigma ~ box height); the + # white top-hat (luma - local_bg) lights up bright thin strokes regardless + # of the absolute background level. 
+ sigma = max(4.0, bh * 0.4) + local_bg = cv2.GaussianBlur(luma, (0, 0), sigmaX=sigma, sigmaY=sigma) + tophat = luma - local_bg + + cand = grayish & (tophat > c.tophat_delta) & (luma > c.logo_min_luma) + glyph = cand.astype(np.uint8) * 255 + glyph = cv2.morphologyEx(glyph, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8)) + k = c.morph_open_size + glyph = cv2.morphologyEx(glyph, cv2.MORPH_OPEN, np.ones((k, k), np.uint8)) + + mask = np.zeros((h, w), np.uint8) + mask[y : y + bh, x : x + bw] = glyph + return mask + + # ── Detect ────────────────────────────────────────────────────────── + + def detect(self, image: NDArray[Any]) -> TextMarkDetection: + """Detect the mark by matching the alpha-template glyph silhouette against + the corner candidate (``TM_CCOEFF_NORMED``); keys on glyph SHAPE, not coverage.""" + c = self.config + det = TextMarkDetection() + if image is None or image.size == 0: + return det + loc = self.locate(image) + mask = self.extract_mask(image, loc) + x, y, bw, bh = loc.bbox + box = mask[y : y + bh, x : x + bw] + coverage = float((box > 0).sum()) / float(max(1, bw * bh)) + det.region = loc.bbox + det.coverage = coverage + if coverage >= c.detect_min_coverage: + score = self._template_match_score(box, image.shape[1]) + det.confidence = score + det.detected = score >= c.detect_ncc_threshold + logger.debug("%s detect: coverage=%.3f ncc=%.2f detected=%s", c.name, coverage, score, det.detected) + return det + + # ── Reverse-alpha (recovery + thin residual inpaint) ──────────────── + + def reverse_alpha_available(self, image: NDArray[Any]) -> bool: + """True if the bundled alpha map is loadable (NCC alignment places it at any + resolution; the caller still gates on ``detect`` so a clean corner is untouched).""" + return image is not None and image.size > 0 and self._alpha_template() is not None + + def _fixed_alpha_map(self, image: NDArray[Any]) -> tuple[NDArray[Any], tuple[int, int, int, int]] | None: + """Place the template by fixed width-relative geometry 
(pixel-exact at the + captured width).""" + c = self.config + at = self._alpha_template() + if at is None: + return None + h, w = image.shape[:2] + # Clamp both dims so a wide/short image cannot overflow the slice assignment. + gw = min(w, max(1, int(c.alpha_width_frac * w))) + gh = min(h, max(1, int(c.alpha_height_frac * w))) + if c.corner == "br": + ax = max(0, w - int(c.alpha_margin_x_frac * w) - gw) + else: # bottom-left + ax = min(max(0, int(c.alpha_margin_x_frac * w)), max(0, w - gw)) + ay = max(0, h - int(c.alpha_margin_bottom_frac * w) - gh) + amap = np.zeros((h, w), np.float32) + amap[ay : ay + gh, ax : ax + gw] = cv2.resize(at, (gw, gh), interpolation=cv2.INTER_LINEAR) + return amap, (ax, ay, gw, gh) + + def _aligned_alpha_map(self, image: NDArray[Any]) -> tuple[NDArray[Any], tuple[int, int, int, int]] | None: + """Register the captured template to the actual mark via a TM_CCOEFF_NORMED + scale + position search. Returns ``(alpha_map, glyph_bbox)`` or None.""" + c = self.config + at = self._alpha_template() + sil = self._glyph_silhouette() + if at is None or sil is None: + return None + h, w = image.shape[:2] + loc = self.locate(image) + bx, by, bw, bh = loc.bbox + box_mask = self.extract_mask(image, loc)[by : by + bh, bx : bx + bw] + expected = c.alpha_width_frac * w + best: tuple[float, int, int, int, int] | None = None + for scale in np.linspace(*c.alpha_align_search): + gw, gh = int(expected * scale), int(c.alpha_height_frac * w * scale) + if gw < c.min_gw or gh < 4 or gw >= bw or gh >= bh: + continue + t = cv2.resize(sil, (gw, gh), interpolation=cv2.INTER_NEAREST) + _, score, _, top_left = cv2.minMaxLoc(cv2.matchTemplate(box_mask, t, cv2.TM_CCOEFF_NORMED)) + if best is None or score > best[0]: + best = (score, gw, gh, top_left[0], top_left[1]) + if best is None: + return None + _, gw, gh, ox, oy = best + ax, ay = bx + ox, by + oy + amap = np.zeros((h, w), np.float32) + amap[ay : ay + gh, ax : ax + gw] = cv2.resize(at, (gw, gh), 
interpolation=cv2.INTER_LINEAR) + return amap, (ax, ay, gw, gh) + + def _apply_reverse_alpha(self, image: NDArray[Any], amap: NDArray[Any]) -> NDArray[Any]: + """Invert the alpha blend with ``amap``: ``original = (wm - a*logo)/(1-a)``.""" + a3 = np.clip(amap, 0.0, 1.0)[:, :, None] + logo = np.array(self.config.alpha_logo_bgr, np.float32) + return np.clip((image.astype(np.float32) - a3 * logo) / np.clip(1.0 - a3, 0.25, 1.0), 0, 255).astype(np.uint8) + + def remove_watermark_reverse_alpha(self, image: NDArray[Any], *, residual_inpaint: bool = True) -> NDArray[Any]: + """Recover the original pixels by inverting the alpha blend, then clear the + residual outline with a thin inpaint over the glyph footprint. + + Placement: fixed geometry AND the NCC-aligned placement are always tried and + the one leaving the least residual mark (lowest re-``detect`` confidence) is + kept -- the mark re-rasterizes a few px per image, so fixed geometry alone is + not reliable. A single capture cannot pixel-cancel the mark on every image, so + a deliberately THIN residual inpaint (``residual_*``) follows: reverse-alpha + has already recovered the true background under the mark, so the inpaint only + finishes the residual edges instead of smearing the whole footprint. Call only + when :meth:`reverse_alpha_available` and the mark is detected. + """ + c = self.config + # Normalize to 3-channel BGR (the reverse-alpha math assumes a 3-channel logo). + image = image_io.to_bgr(image) + # An image too small to hold the mark would make the geometry boxes degenerate + # and feed cv2.resize a ~1-px-tall target; skip cv2 entirely. 
+ h, w = image.shape[:2] + if h < 32 or w < 64: + return image.copy() + maps = [m for m in (self._fixed_alpha_map(image), self._aligned_alpha_map(image)) if m is not None] + if not maps: + return image.copy() + best_out: NDArray[Any] | None = None + best_amap: NDArray[Any] | None = None + best_residual = float("inf") + for amap, _region in maps: + out = self._apply_reverse_alpha(image, amap) + residual = self.detect(out).confidence + if residual < best_residual: + best_residual, best_out, best_amap = residual, out, amap + if best_out is None or best_amap is None: # pragma: no cover - maps is non-empty + return image.copy() + if residual_inpaint: + kernel = np.ones((c.residual_dilate, c.residual_dilate), np.uint8) + rm = cv2.dilate((best_amap > c.residual_alpha_floor).astype(np.uint8) * 255, kernel) + best_out = cv2.inpaint(best_out, rm, c.residual_inpaint_radius, cv2.INPAINT_NS) + return best_out diff --git a/src/remove_ai_watermarks/cli.py b/src/remove_ai_watermarks/cli.py index fd2e435..b64ab45 100644 --- a/src/remove_ai_watermarks/cli.py +++ b/src/remove_ai_watermarks/cli.py @@ -29,7 +29,6 @@ if TYPE_CHECKING: from numpy.typing import NDArray - from remove_ai_watermarks.gemini_engine import DetectionResult # --- plain-text output layer (replaces rich: no colors, no markup, no boxes) --- @@ -291,15 +290,32 @@ def _warn_if_esrgan_unavailable(upscaler: str) -> None: console.print(" Note: --upscaler esrgan needs the 'esrgan' extra; falling back to Lanczos.") -def _watermark_region(det: DetectionResult, width: int, height: int) -> tuple[int, int, int, int]: - """Pick a watermark bbox: detector's region if confident, else the default config slot.""" - if det.confidence > 0.15: - return det.region - from remove_ai_watermarks.gemini_engine import get_watermark_config +def _remove_visible_auto( + image: NDArray[Any], + *, + inpaint: bool = True, + inpaint_method: str = "ns", + inpaint_strength: float = 0.85, +) -> tuple[NDArray[Any], str | None]: + """Remove the 
strongest auto-detected visible mark via the registry. - config = get_watermark_config(width, height) - px, py = config.get_position(width, height) - return (px, py, config.logo_size, config.logo_size) + Routes the ``all``/``batch`` visible step through the same registry path the + standalone ``visible`` command uses, so EVERY registered mark is handled (the + Gemini sparkle AND the Doubao/Jimeng/Samsung text marks), not just the sparkle. + Returns ``(result, label-or-None)``; when no ``in_auto`` mark fires the image is + returned unchanged with ``None``. ``inpaint*`` tune the Gemini edge-residual + cleanup only (the text engines ignore them). + """ + from remove_ai_watermarks import watermark_registry + + best = watermark_registry.best_auto_mark(image) + if best is None: + return image, None + method: Literal["telea", "ns"] = "ns" if inpaint_method == "ns" else "telea" + result, _ = watermark_registry.get_mark(best.key).remove( + image, inpaint_method=method, inpaint=inpaint, inpaint_strength=inpaint_strength, force=False + ) + return result, best.label def _read_bgr_and_alpha(path: Path) -> tuple[NDArray[Any] | None, NDArray[Any] | None]: @@ -893,8 +909,6 @@ def cmd_all( If invisible watermark deps are not installed, skips step 2 with a warning. 
""" - from remove_ai_watermarks.gemini_engine import GeminiEngine - _banner() source = _validate_image(source) _warn_if_esrgan_unavailable(upscaler) @@ -918,7 +932,6 @@ def cmd_all( # -- Step 1: Visible watermark -------------------------------- console.print("\n 1) Visible watermark removal") - engine = GeminiEngine() image, alpha = _read_bgr_and_alpha(source) if image is None: console.print(f"Error: Failed to read image: {source}") @@ -928,15 +941,10 @@ def cmd_all( console.print(f" Input: {source.name} ({w}x{h})") with console.status("Removing visible watermark..."): - det = engine.detect_watermark(image) - if det.detected: - result = engine.remove_watermark(image) - if inpaint: - region = _watermark_region(det, w, h) - result = engine.inpaint_residual(result, region, method=inpaint_method) - console.print(" Visible watermark removed") + result, removed_label = _remove_visible_auto(image, inpaint=inpaint, inpaint_method=inpaint_method) + if removed_label is not None: + console.print(f" Visible watermark removed ({removed_label})") else: - result = image.copy() console.print(" Skipped (no visible watermark detected)") # Save to temp file for invisible engine input (preserve alpha if present) @@ -1058,27 +1066,15 @@ def _process_batch_image( saved_alpha: NDArray[Any] | None = None if mode in ("visible", "all"): - from remove_ai_watermarks.gemini_engine import GeminiEngine - - if "_vis_engine" not in ctx.obj: - ctx.obj["_vis_engine"] = GeminiEngine() - engine = ctx.obj["_vis_engine"] - read_path = img_path - if mode == "all" and out_path.exists(): - read_path = out_path - image, alpha = _read_bgr_and_alpha(read_path) + # Always read the ORIGINAL source: the visible pass is the first step, so a + # stale out_path from a previous run must not be re-processed as if it were + # the input. (The invisible step below reads out_path for `all` -- that chain + # is within a single run.) 
+ image, alpha = _read_bgr_and_alpha(img_path) if image is None: raise ValueError("Failed to read image") - det = engine.detect_watermark(image) - if det.detected: - result = engine.remove_watermark(image) - if inpaint: - h, w = image.shape[:2] - region = _watermark_region(det, w, h) - result = engine.inpaint_residual(result, region) - else: - result = image.copy() + result, _ = _remove_visible_auto(image, inpaint=inpaint) _write_bgr_with_alpha(out_path, result, alpha) saved_alpha = alpha diff --git a/src/remove_ai_watermarks/doubao_engine.py b/src/remove_ai_watermarks/doubao_engine.py index b79b0ea..1142c41 100644 --- a/src/remove_ai_watermarks/doubao_engine.py +++ b/src/remove_ai_watermarks/doubao_engine.py @@ -4,417 +4,122 @@ Doubao (ByteDance) stamps every generated image with a visible "豆包AI生成" (Doubao AI generated) text strip in the bottom-right corner -- the explicit AIGC label mandated by China's TC260 standard, a near-white semi-transparent overlay. -Like the Gemini sparkle and the Jimeng wordmark, it is a fixed overlay, so removal -starts from **reverse-alpha blending** against a captured alpha map -(``remove_watermark_reverse_alpha``): ``original = (wm - a*logo)/(1-a)``. The alpha -map is rebuilt by ``scripts/visible_alpha_solve.py`` from black/gray Doubao captures -(the careful gray-self solve; logo is pure white) and bundled as -``assets/doubao_alpha.png``. The mark re-rasterizes a few px off per image, so -removal ALWAYS NCC-aligns the template to the actual mark and then clears the -residual edges with a deliberately THIN inpaint over the glyph footprint (an -earlier under-estimated alpha + fixed-no-inpaint left a readable outline that the -detector did not flag -- see the reverse-alpha section below). - -Detection (``detect``) is shape-consistent: it matches that same alpha glyph -silhouette against the corner via normalized correlation, so it keys on the actual -"豆包AI生成" shape rather than coverage/structure heuristics. 
- -``locate`` (geometry box, scales with image WIDTH) and ``extract_mask`` (the -candidate glyph mask the detector correlates) mirror the Jimeng engine. -Arbitrary-region inpainting still lives in ``region_eraser`` / the ``erase`` -command. Fast, offline, no GPU. +Removal is **reverse-alpha blending** against a captured alpha map +(``original = (wm - a*logo)/(1-a)``), always NCC-aligned to the actual mark plus a +thin residual inpaint over the glyph footprint. This is one of the three text-mark +engines that share :class:`remove_ai_watermarks._text_mark_engine.TextMarkEngine`; +this module supplies only Doubao's tuned :class:`TextMarkConfig` (bottom-right corner, +``assets/doubao_alpha.png`` rebuilt by ``scripts/visible_alpha_solve.py``). Arbitrary- +region inpainting still lives in ``region_eraser`` / the ``erase`` command. """ +# The module-level _alpha_template / _glyph_silhouette / _template_match_score below +# are thin test-facing shims (imported by tests/), so pyright's src-only pass sees them +# as unused; the use is cross-module. +# pyright: reportUnusedFunction=false -# cv2/numpy boundary: third-party libs ship no usable element types; relax the -# unknown-type rules for this file only. 
-# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false from __future__ import annotations -import logging -from dataclasses import dataclass from typing import TYPE_CHECKING, Any -import cv2 -import numpy as np +from remove_ai_watermarks import _text_mark_engine +from remove_ai_watermarks._text_mark_engine import TextMarkConfig, TextMarkDetection, TextMarkEngine if TYPE_CHECKING: from pathlib import Path from numpy.typing import NDArray -logger = logging.getLogger(__name__) - - -# Geometry as a fraction of image WIDTH. The Doubao mark scales with width and -# is anchored bottom-right. The box must be GENEROUSLY wider than the mark and -# reach close to the corner -- the mark is re-rasterized a few px off per image, -# and the NCC alignment search only registers within this box, so a tight box -# (the old 0.185 / margin 0.012) let a corner-ward shift fall partly outside it -# and the alignment missed. The glyph mask tightens the actual removal. +# Locate geometry as a fraction of image WIDTH (the mark scales with width, anchored +# bottom-right). The box is GENEROUSLY wider than the mark and reaches close to the +# corner so a per-image re-rasterization shift stays inside the NCC alignment search. 
WM_WIDTH_FRAC = 0.22 WM_HEIGHT_FRAC = 0.075 MARGIN_RIGHT_FRAC = 0.004 MARGIN_BOTTOM_FRAC = 0.004 -# Glyph appearance: the label is a low-saturation light gray, rendered brighter -# than the surrounding content (the common case: a generated photo/illustration). -# We detect it as a local bright feature (white top-hat: brighter than a blurred -# local background) intersected with the grayish + minimum-brightness tests. -# This is polarity-correct for bright-on-darker backgrounds and, crucially, -# leaves white-paper documents untouched (there the mark is not brighter than -# its surroundings, so nothing is masked rather than damaging the document text). +# Glyph appearance: a light, low-saturation gray rendered brighter than the local +# background (white top-hat), so a white-paper document is left untouched. MAX_SATURATION = 55 # max channel spread to count a pixel as "grayish" LOGO_MIN_LUMA = 150 # glyphs are at least this bright in absolute terms TOPHAT_DELTA = 12 # glyph must exceed the local background by this many levels -# Detection is reverse-alpha-consistent: the mark is recognized by matching the -# bundled alpha-template glyph silhouette (assets/doubao_alpha.png -- the exact -# shape we invert) against the extracted candidate mask via zero-mean normalized -# correlation (cv2 TM_CCOEFF_NORMED). It keys on the actual "豆包AI生成" glyph -# SHAPE, not on coverage/structure heuristics, so a merely-textured corner does -# not fire (the old coverage detector false-positived on ~28% of images; #23). -# Corpus-tuned: real marks score median ~0.61, arbitrary corners <=0.17 (p99); -# threshold 0.4 -> false positives 7/1243 (0.6%). A small coverage floor skips -# the template match on a near-empty candidate box. +# Shape-consistent detection: match the bundled alpha glyph silhouette against the +# corner candidate via TM_CCOEFF_NORMED (keys on glyph SHAPE, not coverage; #23). 
DETECT_MIN_COVERAGE = 0.04 DETECT_NCC_THRESHOLD = 0.4 -# ── Reverse-alpha (recovery + thin residual inpaint) ───────────────── -# The Doubao mark is a fixed semi-transparent white overlay, so given its alpha -# map the original pixels are recovered by inverting the blend: (wm - a*logo)/(1-a). -# The alpha map is rebuilt by scripts/visible_alpha_solve.py from the black/gray -# Doubao captures (data/doubao_capture/): the CAREFUL solve -- a = (I - B)/(255 - B) -# on the gray capture with B a per-channel cubic background fit, mean over channels, -# full halo extent, unblurred. The earlier build (a coarser solve) under-estimated -# the alpha and left a clearly READABLE "豆包AI生成" outline on real samples -# (issue #13 follow-up: the detector was fooled by the outline -- conf 0.0 -- so the -# test passed while the result was visibly bad; suspect the captured alpha map, not -# the method). The mark is re-rasterized and a few px off per image, so removal -# does NOT trust fixed geometry: it ALWAYS tries fixed AND `_aligned_alpha_map`'s -# TM_CCOEFF_NORMED scale+position search and keeps the lower-residual placement, -# then a deliberately THIN residual inpaint clears the leftover edges without -# smearing the recovered texture. Geometry below is emitted by the solver -- keep in -# sync when the asset is rebuilt. +# Reverse-alpha geometry, emitted by scripts/visible_alpha_solve.py at the captured +# width. Removal always tries fixed AND NCC-aligned placement and keeps the lower +# residual, then a thin footprint inpaint clears the leftover edges. _ALPHA_NATIVE_WIDTH = 2048 _ALPHA_LOGO_BGR: tuple[float, float, float] = (255.0, 255.0, 255.0) _ALPHA_WIDTH_FRAC = 0.1636 # asset width / image width -- the alignment scale seed _ALPHA_HEIGHT_FRAC = 0.0405 -# Margins (of image WIDTH) of the captured mark -- the geometry record / where to -# seed; alignment refines the actual position, so these are not load-bearing. 
_ALPHA_MARGIN_RIGHT_FRAC = 0.0132 _ALPHA_MARGIN_BOTTOM_FRAC = 0.0166 -# Alignment scale search (np.linspace args) around the width-scaled glyph size. _ALPHA_ALIGN_SEARCH = (0.88, 1.12, 25) -# Residual inpaint over the glyph footprint -- thin (NS, small radius) so it clears -# the leftover edges without the smear a wide full-footprint pass caused. _RESIDUAL_ALPHA_FLOOR = 0.05 _RESIDUAL_DILATE = 5 _RESIDUAL_INPAINT_RADIUS = 2 -_alpha_template_cache: NDArray[Any] | None = None + +_CONFIG = TextMarkConfig( + name="Doubao", + asset_name="doubao_alpha.png", + corner="br", + margin_floor=4, + width_frac=WM_WIDTH_FRAC, + height_frac=WM_HEIGHT_FRAC, + margin_x_frac=MARGIN_RIGHT_FRAC, + margin_bottom_frac=MARGIN_BOTTOM_FRAC, + max_saturation=MAX_SATURATION, + logo_min_luma=LOGO_MIN_LUMA, + tophat_delta=TOPHAT_DELTA, + morph_open_size=5, + detect_min_coverage=DETECT_MIN_COVERAGE, + detect_ncc_threshold=DETECT_NCC_THRESHOLD, + alpha_width_frac=_ALPHA_WIDTH_FRAC, + alpha_height_frac=_ALPHA_HEIGHT_FRAC, + alpha_margin_x_frac=_ALPHA_MARGIN_RIGHT_FRAC, + alpha_margin_bottom_frac=_ALPHA_MARGIN_BOTTOM_FRAC, + alpha_align_search=_ALPHA_ALIGN_SEARCH, + min_gw=8, + alpha_logo_bgr=_ALPHA_LOGO_BGR, + residual_alpha_floor=_RESIDUAL_ALPHA_FLOOR, + residual_dilate=_RESIDUAL_DILATE, + residual_inpaint_radius=_RESIDUAL_INPAINT_RADIUS, +) + +# Doubao-specific aliases for the shared detection result/engine. 
+DoubaoDetection = TextMarkDetection def _alpha_template() -> NDArray[Any] | None: - """Lazily load the bundled Doubao alpha template (float [0,1]), or None.""" - global _alpha_template_cache - if _alpha_template_cache is None: - from pathlib import Path - - from remove_ai_watermarks import image_io - - path = Path(__file__).parent / "assets" / "doubao_alpha.png" - img = image_io.imread(str(path), cv2.IMREAD_GRAYSCALE) - if img is None: - return None - _alpha_template_cache = img.astype(np.float32) / 255.0 - return _alpha_template_cache - - -@dataclass(frozen=True) -class DoubaoLocation: - """Located watermark box (bottom-right), in absolute pixel coordinates.""" - - x: int - y: int - w: int - h: int - is_fallback: bool = True # geometry anchor (no template match) -> always True for now - - @property - def bbox(self) -> tuple[int, int, int, int]: - return self.x, self.y, self.w, self.h - - -@dataclass -class DoubaoDetection: - """Result of visible Doubao watermark detection.""" - - detected: bool = False - confidence: float = 0.0 - region: tuple[int, int, int, int] = (0, 0, 0, 0) - coverage: float = 0.0 # fraction of the box occupied by glyph pixels - - -_silhouette_cache: NDArray[Any] | None = None + """The bundled Doubao alpha template (float [0,1]), or None.""" + return _text_mark_engine.load_alpha_template(_CONFIG.asset_name) def _glyph_silhouette() -> NDArray[Any] | None: - """Binary "豆包AI生成" silhouette (255 = glyph) from the bundled alpha map, - used as the detection template. 
None if the alpha asset is missing.""" - global _silhouette_cache - if _silhouette_cache is None: - at = _alpha_template() - if at is None: - return None - _silhouette_cache = (at > 0.15).astype(np.uint8) * 255 - return _silhouette_cache + """Binary "豆包AI生成" silhouette (255 = glyph) from the alpha map, or None.""" + return _text_mark_engine.glyph_silhouette(_CONFIG.asset_name) def _template_match_score(box_mask: NDArray[Any], image_width: int) -> float: - """Zero-mean normalized correlation of the alpha-template glyph silhouette - (scaled to the mark's expected size) against the candidate ``box_mask``. - - TM_CCOEFF_NORMED keys on glyph SHAPE, not coverage, so a dense textured - corner does not score highly -- only the actual "豆包AI生成" shape does. - """ - sil = _glyph_silhouette() - if sil is None or box_mask.size == 0: - return 0.0 - gw = min(box_mask.shape[1] - 1, max(8, int(_ALPHA_WIDTH_FRAC * image_width))) - gh = min(box_mask.shape[0] - 1, max(4, int(_ALPHA_HEIGHT_FRAC * image_width))) - if gw < 8 or gh < 4: - return 0.0 - template = cv2.resize(sil, (gw, gh), interpolation=cv2.INTER_NEAREST) - return float(cv2.matchTemplate(box_mask, template, cv2.TM_CCOEFF_NORMED).max()) + """TM_CCOEFF_NORMED of the Doubao glyph silhouette against ``box_mask``.""" + return _text_mark_engine.template_match_score(box_mask, image_width, _CONFIG) -class DoubaoEngine: - """Remove the visible Doubao "豆包AI生成" watermark (locate -> mask -> inpaint).""" +class DoubaoEngine(TextMarkEngine): + """Remove the visible Doubao "豆包AI生成" watermark (locate -> mask -> reverse-alpha).""" - def __init__( - self, - *, - width_frac: float = WM_WIDTH_FRAC, - height_frac: float = WM_HEIGHT_FRAC, - margin_right_frac: float = MARGIN_RIGHT_FRAC, - margin_bottom_frac: float = MARGIN_BOTTOM_FRAC, - ) -> None: - self.width_frac = width_frac - self.height_frac = height_frac - self.margin_right_frac = margin_right_frac - self.margin_bottom_frac = margin_bottom_frac - - # ── Locate 
──────────────────────────────────────────────────────── - - def locate(self, image: NDArray[Any]) -> DoubaoLocation: - """Anchor the watermark box in the bottom-right corner by geometry.""" - h, w = image.shape[:2] - wm_w = max(40, int(w * self.width_frac)) - wm_h = max(16, int(w * self.height_frac)) - margin_r = max(4, int(w * self.margin_right_frac)) - margin_b = max(4, int(w * self.margin_bottom_frac)) - x = max(0, w - margin_r - wm_w) - y = max(0, h - margin_b - wm_h) - wm_w = min(wm_w, w - x) - wm_h = min(wm_h, h - y) - return DoubaoLocation(x=x, y=y, w=wm_w, h=wm_h, is_fallback=True) - - # ── Mask ────────────────────────────────────────────────────────── - - def extract_mask(self, image: NDArray[Any], loc: DoubaoLocation) -> NDArray[Any]: - """Build a full-image uint8 mask (255 = watermark glyph) for the box. - - Polarity-aware: the mark is a light, low-saturation gray. On a dark - background it is the bright region; on a light background it is the - off-white gray below paper-white. Both cases are captured by the logo - luminance band intersected with the grayish constraint, plus a - brighter-than-local-background test on dark backgrounds. - """ - h, w = image.shape[:2] - x, y, bw, bh = loc.bbox - # A degenerate ROI (a sliver from an extremely wide/short image) cannot hold - # the mark and would feed cv2's GaussianBlur/morphology a ~1-px-tall array, - # which can fault the native code on some platforms (observed: a Windows - # access violation via the always-align removal's residual `detect`). Skip - # the cv2 pipeline and return an empty mask there. - if bh < 16 or bw < 16: - return np.zeros((h, w), np.uint8) - # Normalize the ROI to 3-channel BGR: a 2D grayscale or 4-channel BGRA - # input would otherwise break the axis=2 channel reductions below. 
- roi = image[y : y + bh, x : x + bw] - if roi.ndim == 2: - roi = cv2.cvtColor(roi, cv2.COLOR_GRAY2BGR) - elif roi.shape[2] == 4: - roi = cv2.cvtColor(roi, cv2.COLOR_BGRA2BGR) - roi = roi.astype(np.float32) - - luma = roi.mean(axis=2) - sat = roi.max(axis=2) - roi.min(axis=2) - grayish = sat < MAX_SATURATION - - # Local background model: a strong Gaussian blur (sigma ~ box height) - # approximates the content under the glyphs. The white top-hat - # (luma - local_bg) lights up bright thin strokes regardless of the - # absolute background level. - sigma = max(4.0, bh * 0.4) - local_bg = cv2.GaussianBlur(luma, (0, 0), sigmaX=sigma, sigmaY=sigma) - tophat = luma - local_bg - - cand = grayish & (tophat > TOPHAT_DELTA) & (luma > LOGO_MIN_LUMA) - glyph = cand.astype(np.uint8) * 255 - # Connect glyph parts, then drop isolated specks (5x5 open clears the - # scattered grayish pixels that random/textured corners produce). - glyph = cv2.morphologyEx(glyph, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8)) - glyph = cv2.morphologyEx(glyph, cv2.MORPH_OPEN, np.ones((5, 5), np.uint8)) - - mask = np.zeros((h, w), np.uint8) - mask[y : y + bh, x : x + bw] = glyph - return mask - - # ── Detect ──────────────────────────────────────────────────────── - - def detect(self, image: NDArray[Any]) -> DoubaoDetection: - """Detect the visible Doubao mark by matching the alpha-template glyph - silhouette against the corner candidate (TM_CCOEFF_NORMED). - - Keys on the "豆包AI生成" SHAPE, not coverage, so a textured corner does - not fire. ``confidence`` is the correlation score; ``detected`` is it - clearing ``DETECT_NCC_THRESHOLD``. 
- """ - det = DoubaoDetection() - if image is None or image.size == 0: - return det - loc = self.locate(image) - mask = self.extract_mask(image, loc) - x, y, bw, bh = loc.bbox - box = mask[y : y + bh, x : x + bw] - coverage = float((box > 0).sum()) / float(max(1, bw * bh)) - det.region = loc.bbox - det.coverage = coverage - if coverage >= DETECT_MIN_COVERAGE: - score = _template_match_score(box, image.shape[1]) - det.confidence = score - det.detected = score >= DETECT_NCC_THRESHOLD - logger.debug("Doubao detect: coverage=%.3f ncc=%.2f detected=%s", coverage, score, det.detected) - return det - - # ── Reverse-alpha (exact recovery) ──────────────────────────────── - - def reverse_alpha_available(self, image: NDArray[Any]) -> bool: - """True if the bundled alpha map is loadable. Sub-pixel NCC alignment - (see ``_aligned_alpha_map``) places it on the actual mark at ANY - resolution, so there is no width gate -- the caller still gates on - ``detect`` so a clean corner is never touched.""" - return image is not None and image.size > 0 and _alpha_template() is not None - - def _fixed_alpha_map(self, image: NDArray[Any]) -> tuple[NDArray[Any], tuple[int, int, int, int]] | None: - """Place the template by fixed width-relative geometry -- pixel-exact at - the captured width (used there instead of integer-pixel NCC alignment).""" - at = _alpha_template() - if at is None: - return None - h, w = image.shape[:2] - # Glyph box scales with WIDTH; on a wide/short image the height-from-width - # box can exceed the image height. Clamp both dims so the slice assignment - # below cannot overflow (a degenerate 2048x1 input otherwise raised - # ValueError on the broadcast). Normal images are unaffected. 
- gw = min(w, max(1, int(_ALPHA_WIDTH_FRAC * w))) - gh = min(h, max(1, int(_ALPHA_HEIGHT_FRAC * w))) - ax = max(0, w - int(_ALPHA_MARGIN_RIGHT_FRAC * w) - gw) - ay = max(0, h - int(_ALPHA_MARGIN_BOTTOM_FRAC * w) - gh) - amap = np.zeros((h, w), np.float32) - amap[ay : ay + gh, ax : ax + gw] = cv2.resize(at, (gw, gh), interpolation=cv2.INTER_LINEAR) - return amap, (ax, ay, gw, gh) - - def _aligned_alpha_map(self, image: NDArray[Any]) -> tuple[NDArray[Any], tuple[int, int, int, int]] | None: - """Build a full-image alpha map with the captured template registered to - the actual mark via a TM_CCOEFF_NORMED scale + position search -- so the - single capture works off the captured width (a pure width-scale ghosts). - Returns ``(alpha_map, glyph_bbox)`` or None.""" - at = _alpha_template() - sil = _glyph_silhouette() - if at is None or sil is None: - return None - h, w = image.shape[:2] - loc = self.locate(image) - bx, by, bw, bh = loc.bbox - box_mask = self.extract_mask(image, loc)[by : by + bh, bx : bx + bw] - expected = _ALPHA_WIDTH_FRAC * w - best: tuple[float, int, int, int, int] | None = None - for scale in np.linspace(*_ALPHA_ALIGN_SEARCH): - gw, gh = int(expected * scale), int(_ALPHA_HEIGHT_FRAC * w * scale) - if gw < 8 or gh < 4 or gw >= bw or gh >= bh: - continue - t = cv2.resize(sil, (gw, gh), interpolation=cv2.INTER_NEAREST) - _, score, _, top_left = cv2.minMaxLoc(cv2.matchTemplate(box_mask, t, cv2.TM_CCOEFF_NORMED)) - if best is None or score > best[0]: - best = (score, gw, gh, top_left[0], top_left[1]) - if best is None: - return None - _, gw, gh, ox, oy = best - ax, ay = bx + ox, by + oy - amap = np.zeros((h, w), np.float32) - amap[ay : ay + gh, ax : ax + gw] = cv2.resize(at, (gw, gh), interpolation=cv2.INTER_LINEAR) - return amap, (ax, ay, gw, gh) - - def _apply_reverse_alpha(self, image: NDArray[Any], amap: NDArray[Any]) -> NDArray[Any]: - """Invert the alpha blend with ``amap``: ``original = (wm - a*logo)/(1-a)``.""" - a3 = np.clip(amap, 0.0, 1.0)[:, :, 
None] - logo = np.array(_ALPHA_LOGO_BGR, np.float32) - return np.clip((image.astype(np.float32) - a3 * logo) / np.clip(1.0 - a3, 0.25, 1.0), 0, 255).astype(np.uint8) - - def remove_watermark_reverse_alpha(self, image: NDArray[Any], *, residual_inpaint: bool = True) -> NDArray[Any]: - """Recover the original pixels by inverting the alpha blend - ``original = (wm - a*logo)/(1-a)``, then clear the residual edges with a - thin inpaint over the glyph footprint. - - Placement: fixed geometry AND the NCC-aligned placement are always tried and - the one leaving the least residual mark (lowest re-``detect`` confidence) is - kept -- the mark is re-rasterized and a few px off per image, so fixed - geometry alone leaves a visible outline (it did on the doubao-1.png sample). - A single capture cannot pixel-cancel the mark on every image, so a - deliberately THIN residual inpaint (``_RESIDUAL_*``) follows: reverse-alpha - has already recovered the true background under the mark, so the inpaint only - finishes the leftover edges instead of smearing the whole footprint. - Call only when :meth:`reverse_alpha_available` and the mark is detected. - """ - # Normalize to 3-channel BGR so a 2D grayscale or 4-channel BGRA input - # does not break the reverse-alpha math (which assumes a 3-channel logo). - if image.ndim == 2: - image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) - elif image.shape[2] == 4: - image = cv2.cvtColor(image, cv2.COLOR_BGRA2BGR) - # An image too small to hold the mark would make the geometry boxes - # degenerate and feed cv2.resize a ~1-px-tall target / GaussianBlur a sliver - # ROI, which faults natively on Windows (access violation / "Unknown C++ - # exception"). No real watermarked image is this small; skip cv2 entirely. 
- h, w = image.shape[:2] - if h < 32 or w < 64: - return image.copy() - maps = [c for c in (self._fixed_alpha_map(image), self._aligned_alpha_map(image)) if c is not None] - if not maps: - return image.copy() - best_out: NDArray[Any] | None = None - best_amap: NDArray[Any] | None = None - best_residual = float("inf") - for amap, _region in maps: - out = self._apply_reverse_alpha(image, amap) - residual = self.detect(out).confidence - if residual < best_residual: - best_residual, best_out, best_amap = residual, out, amap - if best_out is None or best_amap is None: # pragma: no cover - maps is non-empty - return image.copy() - if residual_inpaint: - kernel = np.ones((_RESIDUAL_DILATE, _RESIDUAL_DILATE), np.uint8) - rm = cv2.dilate((best_amap > _RESIDUAL_ALPHA_FLOOR).astype(np.uint8) * 255, kernel) - best_out = cv2.inpaint(best_out, rm, _RESIDUAL_INPAINT_RADIUS, cv2.INPAINT_NS) - return best_out + def __init__(self) -> None: + super().__init__(_CONFIG) def load_image_bgr(path: str | Path) -> NDArray[Any]: """Read an image as BGR ndarray (helper for scripts/tests).""" from remove_ai_watermarks import image_io - img = image_io.imread(path, cv2.IMREAD_COLOR) + img = image_io.imread(path) if img is None: raise FileNotFoundError(f"Failed to read image: {path}") return img diff --git a/src/remove_ai_watermarks/gemini_engine.py b/src/remove_ai_watermarks/gemini_engine.py index dc84146..834838f 100644 --- a/src/remove_ai_watermarks/gemini_engine.py +++ b/src/remove_ai_watermarks/gemini_engine.py @@ -28,6 +28,8 @@ from typing import TYPE_CHECKING, Any, Literal import cv2 import numpy as np +from remove_ai_watermarks import image_io + if TYPE_CHECKING: from collections.abc import Iterator @@ -290,6 +292,11 @@ class GeminiEngine: if image is None or image.size == 0: return result + # Normalize to 3-channel BGR: the multi-scale search tolerates grayscale, but + # the FP-gate / alpha-gain helpers (_core_and_bg) reduce over axis=2 and would + # crash on a 2D/BGRA input reaching 
this public entry point (e.g. via the + # registry detect adapter or the library API). + image = image_io.to_bgr(image) h, w = image.shape[:2] base_size = force_size or get_watermark_size(w, h) result.size = base_size @@ -481,17 +488,10 @@ class GeminiEngine: Cleaned BGR image as numpy array, or an unmodified copy when no watermark is detected. """ - result = image.copy() - # Normalize to 3-channel BGR up front: 2D grayscale (no channel axis) and # 4-channel BGRA both reach this public entry point and would otherwise # crash on the channel-count checks / downstream 3-channel math. - if result.ndim == 2: - result = cv2.cvtColor(result, cv2.COLOR_GRAY2BGR) - elif result.shape[2] == 4: - result = cv2.cvtColor(result, cv2.COLOR_BGRA2BGR) - elif result.shape[2] == 1: - result = cv2.cvtColor(result, cv2.COLOR_GRAY2BGR) + result = image_io.to_bgr(image.copy()) size = force_size or get_watermark_size(result.shape[1], result.shape[0]) @@ -554,7 +554,9 @@ class GeminiEngine: Returns: Cleaned BGR image. """ - result = image.copy() + # Same channel normalization as remove_watermark: the reverse-alpha blend + # assumes 3-channel BGR (a grayscale/BGRA input would mis-broadcast). + result = image_io.to_bgr(image.copy()) x, y, rw, rh = region # Check standard sizes diff --git a/src/remove_ai_watermarks/image_io.py b/src/remove_ai_watermarks/image_io.py index fdb3dc8..d678a64 100644 --- a/src/remove_ai_watermarks/image_io.py +++ b/src/remove_ai_watermarks/image_io.py @@ -49,6 +49,25 @@ def imread(path: str | Path, flags: int | None = None) -> NDArray[Any] | None: return cv2.imdecode(data, flags) +def to_bgr(image: NDArray[Any]) -> NDArray[Any]: + """Return a 3-channel BGR view of ``image``, promoting grayscale and BGRA. + + The cv2-based engines (sparkle + the reverse-alpha text marks) assume a + 3-channel BGR array for their channel reductions (``mean(axis=2)``, the + per-pixel logo subtraction). 
A 2D grayscale or 4-channel BGRA input -- a real + Gemini-app export is opaque RGBA -- would otherwise crash or mis-broadcast. + Centralizes the shape coercion that was inlined across the engines. A 3-channel + input is returned unchanged (no copy). + """ + import cv2 + + if image.ndim == 2 or image.shape[2] == 1: + return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) + if image.shape[2] == 4: + return cv2.cvtColor(image, cv2.COLOR_BGRA2BGR) + return image + + def imwrite(path: str | Path, img: NDArray[Any]) -> bool: """Unicode-safe ``cv2.imwrite``. diff --git a/src/remove_ai_watermarks/invisible_engine.py b/src/remove_ai_watermarks/invisible_engine.py index d758f7b..c41ae6a 100644 --- a/src/remove_ai_watermarks/invisible_engine.py +++ b/src/remove_ai_watermarks/invisible_engine.py @@ -261,8 +261,14 @@ class InvisibleEngine: vendor=vendor, ) - # Post-processing: optional Humanizer, then restore original resolution. - if humanize > 0.0: + # Post-processing chain: decode the diffusion output ONCE, apply the + # optional stages in memory in order (humanize -> restore original + # resolution -> unsharp -> adaptive polish), and write ONCE. Previously + # each stage independently imread/imwrote the full-res output, so a run + # with several stages PNG-decoded+re-encoded the same image 2-4 times. + # PNG is lossless, so the single-write output is byte-identical. 
+ needs_restore = target is not None # the input was resized before diffusion + if humanize > 0.0 or unsharp > 0.0 or adaptive_polish or needs_restore: import cv2 from remove_ai_watermarks import image_io @@ -271,67 +277,43 @@ class InvisibleEngine: if out_cv is None: return out_path - if self._progress_callback: - self._progress_callback(f"Applying Analog Humanizer (grain: {humanize})...") - from remove_ai_watermarks.humanizer import apply_analog_humanizer + if humanize > 0.0: + if self._progress_callback: + self._progress_callback(f"Applying Analog Humanizer (grain: {humanize})...") + from remove_ai_watermarks.humanizer import apply_analog_humanizer - out_cv = apply_analog_humanizer(out_cv, grain_intensity=humanize, chromatic_shift=1) + out_cv = apply_analog_humanizer(out_cv, grain_intensity=humanize, chromatic_shift=1) - # Restore original resolution + # Restore original resolution if the input was resized for diffusion. if (out_cv.shape[1], out_cv.shape[0]) != orig_size: if self._progress_callback: self._progress_callback( f"Upscaling result back to original resolution {orig_size[0]}x{orig_size[1]}..." ) - # Using INTER_LANCZOS4 for high-quality upscaling back to original out_cv = cv2.resize(out_cv, orig_size, interpolation=cv2.INTER_LANCZOS4) - image_io.imwrite(out_path, out_cv) - - else: - # No humanize: still restore the original size if it was capped. - import cv2 - - from remove_ai_watermarks import image_io - - out_cv = image_io.imread(out_path, cv2.IMREAD_COLOR) - if out_cv is not None and (out_cv.shape[1], out_cv.shape[0]) != orig_size: - if self._progress_callback: - self._progress_callback( - f"Upscaling result back to original resolution {orig_size[0]}x{orig_size[1]}..." - ) - out_cv = cv2.resize(out_cv, orig_size, interpolation=cv2.INTER_LANCZOS4) - image_io.imwrite(out_path, out_cv) - - # Final sharpening. 
- if unsharp > 0.0: - import cv2 - - from remove_ai_watermarks import image_io - from remove_ai_watermarks.humanizer import unsharp_mask - - out_cv = image_io.imread(out_path, cv2.IMREAD_COLOR) - if out_cv is not None: + if unsharp > 0.0: if self._progress_callback: self._progress_callback(f"Sharpening (unsharp mask: {unsharp})...") - image_io.imwrite(out_path, unsharp_mask(out_cv, amount=unsharp)) + from remove_ai_watermarks.humanizer import unsharp_mask - # Adaptive polish (CLI default): restore the input's detail level in the - # softened output, sparing text/edges. Self-limiting where there is no deficit. - if adaptive_polish: - import cv2 - import numpy as np + out_cv = unsharp_mask(out_cv, amount=unsharp) - from remove_ai_watermarks import humanizer, image_io + # Adaptive polish (CLI default): restore the input's detail level in the + # softened output, sparing text/edges. Self-limiting where no deficit. + if adaptive_polish: + import numpy as np + + from remove_ai_watermarks import humanizer - out_cv = image_io.imread(out_path, cv2.IMREAD_COLOR) - if out_cv is not None: ref = cv2.cvtColor(np.array(reference_pil.convert("RGB")), cv2.COLOR_RGB2BGR) if (ref.shape[1], ref.shape[0]) != (out_cv.shape[1], out_cv.shape[0]): ref = cv2.resize(ref, (out_cv.shape[1], out_cv.shape[0]), interpolation=cv2.INTER_LANCZOS4) if self._progress_callback: self._progress_callback("Adaptive polish (sharpen + grain to the input's detail level)...") - image_io.imwrite(out_path, humanizer.adaptive_polish(out_cv, ref, seed=seed)) + out_cv = humanizer.adaptive_polish(out_cv, ref, seed=seed) + + image_io.imwrite(out_path, out_cv) return out_path finally: diff --git a/src/remove_ai_watermarks/jimeng_engine.py b/src/remove_ai_watermarks/jimeng_engine.py index 871f580..a96b936 100644 --- a/src/remove_ai_watermarks/jimeng_engine.py +++ b/src/remove_ai_watermarks/jimeng_engine.py @@ -1,415 +1,109 @@ -"""Jimeng (即梦AI) visible watermark removal engine. 
+"""Jimeng / Dreamina visible watermark removal engine. -Jimeng / Dreamina (ByteDance's image generator, distinct from Doubao) stamps a -visible "★ 即梦AI" wordmark -- a four-point sparkle icon followed by the 即梦AI -characters -- in the bottom-right corner: a near-white semi-transparent overlay, -the explicit AIGC label under China's TC260 standard. +Jimeng (即梦AI, ByteDance) stamps generated images with a visible "★ 即梦AI" wordmark +in the bottom-right corner -- a near-white semi-transparent overlay, the same overlay +class as the Doubao text strip. -Like the Gemini sparkle and the Doubao strip, it is a fixed overlay, so removal -starts from **reverse-alpha blending** against a captured alpha map -(``remove_watermark_reverse_alpha``): ``original = (wm - a*logo)/(1-a)``. The logo -is pure white (255,255,255); the alpha map was solved from the GRAY Jimeng capture -(see data/jimeng_capture/), bundled as ``assets/jimeng_alpha.png`` -- a careful -build (cubic-background fit, mean over channels, full halo extent, unblurred) that -drops the self-residual to ~1.3. Gray is the chosen background because the mark -sits on bright photo content in real use, not on black. - -Unlike the Doubao mark, Jimeng re-rasterizes its mark per generation AND jitters -its position a few px (the alpha maps solved from independent captures correlate -0.998 but not 1.0), so a single 2048 alpha map does not pixel-cancel the mark on -every image/resolution the way Doubao's deterministic overlay does. Removal -therefore NCC-aligns the alpha to the actual mark (always, not only off-native), -reverse-alphas, then clears the residual with a deliberately THIN inpaint over the -glyph footprint. The reverse-alpha pre-step recovers the true background (including -edges) under the semi-transparent mark, so the thin inpaint only finishes the -residual edges rather than smearing the whole footprint -- a wide full-footprint -pass blurred the texture/edges under the mark. 
Verified clean on the solid captures -(native 2048) and on a real 1440-wide Jimeng download (off-native, table edge kept). - -Detection (``detect``) matches the bundled "即梦AI" glyph silhouette against the -corner candidate via normalized correlation, so it keys on the actual mark shape -(real marks score >=0.81, the Doubao strip 0.21, other AI output 0.0) rather than -coverage heuristics, and does not hijack ``--mark auto`` on a Doubao image. - -``locate`` (geometry box, scales with image WIDTH) and ``extract_mask`` (the -candidate glyph mask the detector correlates) mirror the Doubao engine. Fast, -offline, no GPU. Arbitrary-region inpainting still lives in ``region_eraser`` / -the ``erase`` command. +Removal is **reverse-alpha blending** against a captured alpha map +(``original = (wm - a*logo)/(1-a)``), always NCC-aligned to the actual mark plus a thin +residual inpaint over the glyph footprint. This is one of the three text-mark engines +that share :class:`remove_ai_watermarks._text_mark_engine.TextMarkEngine`; this module +supplies only Jimeng's tuned :class:`TextMarkConfig` (bottom-right corner, +``assets/jimeng_alpha.png`` rebuilt by ``scripts/visible_alpha_solve.py`` from the gray +capture). Jimeng images are also caught by the China TC260 AIGC metadata label, so this +is the visible-mark *removal* path, not a new ``identify`` signal. """ +# The module-level _alpha_template / _glyph_silhouette / _template_match_score below +# are thin test-facing shims (imported by tests/), so pyright's src-only pass sees them +# as unused; the use is cross-module. +# pyright: reportUnusedFunction=false -# cv2/numpy boundary: third-party libs ship no usable element types; relax the -# unknown-type rules for this file only. 
-# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false from __future__ import annotations -import logging -from dataclasses import dataclass from typing import TYPE_CHECKING, Any -import cv2 -import numpy as np +from remove_ai_watermarks import _text_mark_engine +from remove_ai_watermarks._text_mark_engine import TextMarkConfig, TextMarkDetection, TextMarkEngine if TYPE_CHECKING: - from pathlib import Path - from numpy.typing import NDArray -logger = logging.getLogger(__name__) - - -# Geometry as a fraction of image WIDTH. The Jimeng mark scales with width and is -# anchored bottom-right. The box is intentionally generous (the glyph mask -# tightens it); values cover the measured 2048 captures plus a real 1440 download. +# Locate geometry as a fraction of image WIDTH (mark scales with width, bottom-right). WM_WIDTH_FRAC = 0.27 WM_HEIGHT_FRAC = 0.092 MARGIN_RIGHT_FRAC = 0.008 MARGIN_BOTTOM_FRAC = 0.010 -# Glyph appearance: a low-saturation light gray rendered brighter than the -# surrounding content (white top-hat: brighter than a blurred local background) -# intersected with the grayish + minimum-brightness tests. Same polarity logic as -# the Doubao engine: leaves white-paper documents untouched (the mark is not -# brighter than its surroundings there, so nothing is masked). 
-MAX_SATURATION = 55 # max channel spread to count a pixel as "grayish" -LOGO_MIN_LUMA = 150 # glyphs are at least this bright in absolute terms -TOPHAT_DELTA = 12 # glyph must exceed the local background by this many levels +# Glyph appearance: a light, low-saturation gray brighter than the local background. +MAX_SATURATION = 55 +LOGO_MIN_LUMA = 150 +TOPHAT_DELTA = 12 -# Detection matches the bundled alpha-template glyph silhouette -# (assets/jimeng_alpha.png) against the candidate via zero-mean normalized -# correlation (cv2 TM_CCOEFF_NORMED). Real Jimeng marks score >=0.83, the Doubao -# strip 0.22, other AI output 0.0 -> threshold 0.45 separates cleanly while -# keeping `--mark auto` from confusing Jimeng with Doubao. A small coverage floor -# skips the template match on a near-empty candidate box. +# Shape-consistent detection. Threshold 0.45 cleanly separates real Jimeng marks +# (>=0.81) from the Doubao strip (0.21), so the two ByteDance marks do not cross-fire. DETECT_MIN_COVERAGE = 0.02 DETECT_NCC_THRESHOLD = 0.45 -# ── Reverse-alpha (recovery, Gemini/Doubao-style) ──────────────────── -# The Jimeng mark is a fixed semi-transparent white overlay; given its alpha map -# the original pixels are recovered by inverting the blend. The logo is pure white -# (the white capture confirms L=255 and a pair-solve of L lands at ~254.6). The -# alpha map was solved from the GRAY capture: a = (I - B)/(255 - B) with B a -# per-capture CUBIC background fit over the non-glyph pixels, averaged over the -# three channels, kept at full halo extent (down to a~0.02) and UNBLURRED. Gray -# (background ~132, mark contrast ~120) is chosen over black because it is the -# best proxy for real content, where the mark sits on bright photo areas, not on -# black; the careful build drops the gray self-residual to ~1.3 (the earlier -# max-channel / quadratic-bg / blurred / halo-truncated build was visibly worse -- -# the mask, not the method, was the limit). 
The bundled asset -# (assets/jimeng_alpha.png) is the alpha template (a*255) at the captured width. -# The mark scales with image WIDTH; a pure width-scale is only sub-pixel-accurate -# at the captured width, so removal also registers the template to the actual mark -# via a TM_CCOEFF_NORMED scale+position search (`_aligned_alpha_map`) off it. +# Reverse-alpha geometry, emitted by scripts/visible_alpha_solve.py from the gray +# capture at the captured width. _ALPHA_NATIVE_WIDTH = 2048 _ALPHA_LOGO_BGR: tuple[float, float, float] = (255.0, 255.0, 255.0) -# Geometry below is emitted by scripts/visible_alpha_solve.py for the bundled -# asset -- keep them in sync when the asset is rebuilt. _ALPHA_WIDTH_FRAC = 0.2021 # asset width / image width -- the alignment scale seed _ALPHA_HEIGHT_FRAC = 0.0576 -# Margins (of image WIDTH) of the captured mark -- the geometry record / where to -# seed; alignment refines the actual position, so these are not load-bearing. _ALPHA_MARGIN_RIGHT_FRAC = 0.0288 _ALPHA_MARGIN_BOTTOM_FRAC = 0.0288 -# Alignment scale search (np.linspace args) around the width-scaled glyph size -- -# fine enough that a per-image scale/position jitter does not leave a thick -# edge-misalignment outline (a coarse step left ~4px slop at the mark ends). _ALPHA_ALIGN_SEARCH = (0.90, 1.12, 23) -# Residual inpaint footprint: unlike Doubao, Jimeng's per-image render variation -# leaves a faint outline even at native, so the glyph footprint (alpha above this) -# is always inpainted after reverse-alpha (dilated by this kernel, INPAINT_NS). -# Kept deliberately THIN -- the careful alpha map (cubic-background, mean-channel, -# full-halo solve) knocks the mark down far enough that a tight footprint clears -# it, so the inpaint does not smear the texture/edges under the mark the way a -# wide full-footprint pass did. 
_RESIDUAL_ALPHA_FLOOR = 0.05 _RESIDUAL_DILATE = 5 _RESIDUAL_INPAINT_RADIUS = 2 -_alpha_template_cache: NDArray[Any] | None = None + +_CONFIG = TextMarkConfig( + name="Jimeng", + asset_name="jimeng_alpha.png", + corner="br", + margin_floor=4, + width_frac=WM_WIDTH_FRAC, + height_frac=WM_HEIGHT_FRAC, + margin_x_frac=MARGIN_RIGHT_FRAC, + margin_bottom_frac=MARGIN_BOTTOM_FRAC, + max_saturation=MAX_SATURATION, + logo_min_luma=LOGO_MIN_LUMA, + tophat_delta=TOPHAT_DELTA, + morph_open_size=5, + detect_min_coverage=DETECT_MIN_COVERAGE, + detect_ncc_threshold=DETECT_NCC_THRESHOLD, + alpha_width_frac=_ALPHA_WIDTH_FRAC, + alpha_height_frac=_ALPHA_HEIGHT_FRAC, + alpha_margin_x_frac=_ALPHA_MARGIN_RIGHT_FRAC, + alpha_margin_bottom_frac=_ALPHA_MARGIN_BOTTOM_FRAC, + alpha_align_search=_ALPHA_ALIGN_SEARCH, + min_gw=8, + alpha_logo_bgr=_ALPHA_LOGO_BGR, + residual_alpha_floor=_RESIDUAL_ALPHA_FLOOR, + residual_dilate=_RESIDUAL_DILATE, + residual_inpaint_radius=_RESIDUAL_INPAINT_RADIUS, +) + +JimengDetection = TextMarkDetection def _alpha_template() -> NDArray[Any] | None: - """Lazily load the bundled Jimeng alpha template (float [0,1]), or None.""" - global _alpha_template_cache - if _alpha_template_cache is None: - from pathlib import Path - - from remove_ai_watermarks import image_io - - path = Path(__file__).parent / "assets" / "jimeng_alpha.png" - img = image_io.imread(str(path), cv2.IMREAD_GRAYSCALE) - if img is None: - return None - _alpha_template_cache = img.astype(np.float32) / 255.0 - return _alpha_template_cache - - -@dataclass(frozen=True) -class JimengLocation: - """Located watermark box (bottom-right), in absolute pixel coordinates.""" - - x: int - y: int - w: int - h: int - is_fallback: bool = True # geometry anchor (no template match) -> always True for now - - @property - def bbox(self) -> tuple[int, int, int, int]: - return self.x, self.y, self.w, self.h - - -@dataclass -class JimengDetection: - """Result of visible Jimeng watermark detection.""" - - detected: bool = 
False - confidence: float = 0.0 - region: tuple[int, int, int, int] = (0, 0, 0, 0) - coverage: float = 0.0 # fraction of the box occupied by glyph pixels - - -_silhouette_cache: NDArray[Any] | None = None + """The bundled Jimeng alpha template (float [0,1]), or None.""" + return _text_mark_engine.load_alpha_template(_CONFIG.asset_name) def _glyph_silhouette() -> NDArray[Any] | None: - """Binary "即梦AI" silhouette (255 = glyph) from the bundled alpha map, used - as the detection template. None if the alpha asset is missing.""" - global _silhouette_cache - if _silhouette_cache is None: - at = _alpha_template() - if at is None: - return None - _silhouette_cache = (at > 0.15).astype(np.uint8) * 255 - return _silhouette_cache + """Binary "即梦AI" silhouette (255 = glyph) from the alpha map, or None.""" + return _text_mark_engine.glyph_silhouette(_CONFIG.asset_name) def _template_match_score(box_mask: NDArray[Any], image_width: int) -> float: - """Zero-mean normalized correlation of the alpha-template glyph silhouette - (scaled to the mark's expected size) against the candidate ``box_mask``.""" - sil = _glyph_silhouette() - if sil is None or box_mask.size == 0: - return 0.0 - gw = min(box_mask.shape[1] - 1, max(8, int(_ALPHA_WIDTH_FRAC * image_width))) - gh = min(box_mask.shape[0] - 1, max(4, int(_ALPHA_HEIGHT_FRAC * image_width))) - if gw < 8 or gh < 4: - return 0.0 - template = cv2.resize(sil, (gw, gh), interpolation=cv2.INTER_NEAREST) - return float(cv2.matchTemplate(box_mask, template, cv2.TM_CCOEFF_NORMED).max()) + """TM_CCOEFF_NORMED of the Jimeng glyph silhouette against ``box_mask``.""" + return _text_mark_engine.template_match_score(box_mask, image_width, _CONFIG) -class JimengEngine: - """Remove the visible Jimeng "即梦AI" watermark (locate -> mask -> reverse-alpha).""" +class JimengEngine(TextMarkEngine): + """Remove the visible Jimeng "★ 即梦AI" watermark (locate -> mask -> reverse-alpha).""" - def __init__( - self, - *, - width_frac: float = WM_WIDTH_FRAC, - 
height_frac: float = WM_HEIGHT_FRAC, - margin_right_frac: float = MARGIN_RIGHT_FRAC, - margin_bottom_frac: float = MARGIN_BOTTOM_FRAC, - ) -> None: - self.width_frac = width_frac - self.height_frac = height_frac - self.margin_right_frac = margin_right_frac - self.margin_bottom_frac = margin_bottom_frac - - # ── Locate ──────────────────────────────────────────────────────── - - def locate(self, image: NDArray[Any]) -> JimengLocation: - """Anchor the watermark box in the bottom-right corner by geometry.""" - h, w = image.shape[:2] - wm_w = max(40, int(w * self.width_frac)) - wm_h = max(16, int(w * self.height_frac)) - margin_r = max(4, int(w * self.margin_right_frac)) - margin_b = max(4, int(w * self.margin_bottom_frac)) - x = max(0, w - margin_r - wm_w) - y = max(0, h - margin_b - wm_h) - wm_w = min(wm_w, w - x) - wm_h = min(wm_h, h - y) - return JimengLocation(x=x, y=y, w=wm_w, h=wm_h, is_fallback=True) - - # ── Mask ────────────────────────────────────────────────────────── - - def extract_mask(self, image: NDArray[Any], loc: JimengLocation) -> NDArray[Any]: - """Build a full-image uint8 mask (255 = watermark glyph) for the box. - - Polarity-aware: the mark is a light, low-saturation gray rendered brighter - than the local background (white top-hat), so a white-paper document is - left untouched (nothing brighter than its surroundings is masked there). - """ - h, w = image.shape[:2] - x, y, bw, bh = loc.bbox - # A degenerate ROI (a sliver from an extremely wide/short image) cannot hold - # the mark and would feed cv2's GaussianBlur/morphology a ~1-px-tall array, - # which can fault the native code on some platforms (observed: a Windows - # access violation via the always-align removal's residual `detect`). Skip - # the cv2 pipeline and return an empty mask there. - if bh < 16 or bw < 16: - return np.zeros((h, w), np.uint8) - # Normalize the ROI to 3-channel BGR: a 2D grayscale or 4-channel BGRA - # input would otherwise break the axis=2 channel reductions below. 
- roi = image[y : y + bh, x : x + bw] - if roi.ndim == 2: - roi = cv2.cvtColor(roi, cv2.COLOR_GRAY2BGR) - elif roi.shape[2] == 4: - roi = cv2.cvtColor(roi, cv2.COLOR_BGRA2BGR) - roi = roi.astype(np.float32) - - luma = roi.mean(axis=2) - sat = roi.max(axis=2) - roi.min(axis=2) - grayish = sat < MAX_SATURATION - - sigma = max(4.0, bh * 0.4) - local_bg = cv2.GaussianBlur(luma, (0, 0), sigmaX=sigma, sigmaY=sigma) - tophat = luma - local_bg - - cand = grayish & (tophat > TOPHAT_DELTA) & (luma > LOGO_MIN_LUMA) - glyph = cand.astype(np.uint8) * 255 - glyph = cv2.morphologyEx(glyph, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8)) - glyph = cv2.morphologyEx(glyph, cv2.MORPH_OPEN, np.ones((5, 5), np.uint8)) - - mask = np.zeros((h, w), np.uint8) - mask[y : y + bh, x : x + bw] = glyph - return mask - - # ── Detect ──────────────────────────────────────────────────────── - - def detect(self, image: NDArray[Any]) -> JimengDetection: - """Detect the visible Jimeng mark by matching the alpha-template glyph - silhouette against the corner candidate (TM_CCOEFF_NORMED).""" - det = JimengDetection() - if image is None or image.size == 0: - return det - loc = self.locate(image) - mask = self.extract_mask(image, loc) - x, y, bw, bh = loc.bbox - box = mask[y : y + bh, x : x + bw] - coverage = float((box > 0).sum()) / float(max(1, bw * bh)) - det.region = loc.bbox - det.coverage = coverage - if coverage >= DETECT_MIN_COVERAGE: - score = _template_match_score(box, image.shape[1]) - det.confidence = score - det.detected = score >= DETECT_NCC_THRESHOLD - logger.debug("Jimeng detect: coverage=%.3f ncc=%.2f detected=%s", coverage, score, det.detected) - return det - - # ── Reverse-alpha (recovery + residual inpaint) ─────────────────── - - def reverse_alpha_available(self, image: NDArray[Any]) -> bool: - """True if the bundled alpha map is loadable (NCC alignment places it at - any resolution; the caller still gates on ``detect``).""" - return image is not None and image.size > 0 and 
_alpha_template() is not None - - def _fixed_alpha_map(self, image: NDArray[Any]) -> tuple[NDArray[Any], tuple[int, int, int, int]] | None: - """Place the template by fixed width-relative geometry.""" - at = _alpha_template() - if at is None: - return None - h, w = image.shape[:2] - gw = min(w, max(1, int(_ALPHA_WIDTH_FRAC * w))) - gh = min(h, max(1, int(_ALPHA_HEIGHT_FRAC * w))) - ax = max(0, w - int(_ALPHA_MARGIN_RIGHT_FRAC * w) - gw) - ay = max(0, h - int(_ALPHA_MARGIN_BOTTOM_FRAC * w) - gh) - amap = np.zeros((h, w), np.float32) - amap[ay : ay + gh, ax : ax + gw] = cv2.resize(at, (gw, gh), interpolation=cv2.INTER_LINEAR) - return amap, (ax, ay, gw, gh) - - def _aligned_alpha_map(self, image: NDArray[Any]) -> tuple[NDArray[Any], tuple[int, int, int, int]] | None: - """Register the captured template to the actual mark via a - TM_CCOEFF_NORMED scale + position search -- so the single capture works - off the captured width. Returns ``(alpha_map, glyph_bbox)`` or None.""" - at = _alpha_template() - sil = _glyph_silhouette() - if at is None or sil is None: - return None - h, w = image.shape[:2] - loc = self.locate(image) - bx, by, bw, bh = loc.bbox - box_mask = self.extract_mask(image, loc)[by : by + bh, bx : bx + bw] - expected = _ALPHA_WIDTH_FRAC * w - best: tuple[float, int, int, int, int] | None = None - for scale in np.linspace(*_ALPHA_ALIGN_SEARCH): - gw, gh = int(expected * scale), int(_ALPHA_HEIGHT_FRAC * w * scale) - if gw < 8 or gh < 4 or gw >= bw or gh >= bh: - continue - t = cv2.resize(sil, (gw, gh), interpolation=cv2.INTER_NEAREST) - _, score, _, top_left = cv2.minMaxLoc(cv2.matchTemplate(box_mask, t, cv2.TM_CCOEFF_NORMED)) - if best is None or score > best[0]: - best = (score, gw, gh, top_left[0], top_left[1]) - if best is None: - return None - _, gw, gh, ox, oy = best - ax, ay = bx + ox, by + oy - amap = np.zeros((h, w), np.float32) - amap[ay : ay + gh, ax : ax + gw] = cv2.resize(at, (gw, gh), interpolation=cv2.INTER_LINEAR) - return amap, (ax, ay, gw, 
gh) - - def _apply_reverse_alpha(self, image: NDArray[Any], amap: NDArray[Any]) -> NDArray[Any]: - """Invert the alpha blend with ``amap``: ``original = (wm - a*logo)/(1-a)``.""" - a3 = np.clip(amap, 0.0, 1.0)[:, :, None] - logo = np.array(_ALPHA_LOGO_BGR, np.float32) - return np.clip((image.astype(np.float32) - a3 * logo) / np.clip(1.0 - a3, 0.25, 1.0), 0, 255).astype(np.uint8) - - def remove_watermark_reverse_alpha(self, image: NDArray[Any], *, residual_inpaint: bool = True) -> NDArray[Any]: - """Recover the original pixels by inverting the alpha blend, then clear - the residual outline with a thin inpaint over the glyph footprint. - - Placement: fixed geometry AND the NCC-aligned placement are always tried - and the one leaving the least residual mark (lowest re-``detect`` - confidence) is kept -- Jimeng jitters the mark a few px per image even at - the captured width, so fixed geometry alone is not reliable. A single 2048 - alpha cannot pixel-cancel the mark re-rasterized at another resolution, so a - deliberately THIN residual inpaint (``_RESIDUAL_*``) follows: reverse-alpha - has already recovered the true background (edges included) under the mark, - so the inpaint only finishes the residual edges instead of smearing the - whole footprint. Call only when :meth:`reverse_alpha_available` and the mark - is detected. - """ - # Normalize to 3-channel BGR so a 2D grayscale or 4-channel BGRA input - # does not break the reverse-alpha math (which assumes a 3-channel logo). - if image.ndim == 2: - image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) - elif image.shape[2] == 4: - image = cv2.cvtColor(image, cv2.COLOR_BGRA2BGR) - # An image too small to hold the mark would make the geometry boxes - # degenerate and feed cv2.resize a ~1-px-tall target / GaussianBlur a sliver - # ROI, which faults natively on Windows (access violation / "Unknown C++ - # exception"). No real watermarked image is this small; skip cv2 entirely. 
- h, w = image.shape[:2] - if h < 32 or w < 64: - return image.copy() - # Always try fixed geometry AND the NCC-aligned placement and keep - # whichever leaves the least residual mark (re-detect confidence on the - # bare reverse-alpha). Unlike Doubao's deterministic overlay, Jimeng jitters - # the mark's position a few px PER IMAGE even at the captured width, so - # fixed geometry alone misses there too -- the NCC search registers the - # template to the actual mark; fixed stays as a fallback if the search has - # no saliency to lock onto (a flat/contrastless mark). - maps = [c for c in (self._fixed_alpha_map(image), self._aligned_alpha_map(image)) if c is not None] - if not maps: - return image.copy() - best_out: NDArray[Any] | None = None - best_amap: NDArray[Any] | None = None - best_residual = float("inf") - for amap, _region in maps: - out = self._apply_reverse_alpha(image, amap) - residual = self.detect(out).confidence - if residual < best_residual: - best_residual, best_out, best_amap = residual, out, amap - if best_out is None or best_amap is None: # pragma: no cover - maps is non-empty - return image.copy() - if residual_inpaint: - kernel = np.ones((_RESIDUAL_DILATE, _RESIDUAL_DILATE), np.uint8) - rm = cv2.dilate((best_amap > _RESIDUAL_ALPHA_FLOOR).astype(np.uint8) * 255, kernel) - best_out = cv2.inpaint(best_out, rm, _RESIDUAL_INPAINT_RADIUS, cv2.INPAINT_NS) - return best_out - - -def load_image_bgr(path: str | Path) -> NDArray[Any]: - """Read an image as BGR ndarray (helper for scripts/tests).""" - from remove_ai_watermarks import image_io - - img = image_io.imread(path, cv2.IMREAD_COLOR) - if img is None: - raise FileNotFoundError(f"Failed to read image: {path}") - return img + def __init__(self) -> None: + super().__init__(_CONFIG) diff --git a/src/remove_ai_watermarks/metadata.py b/src/remove_ai_watermarks/metadata.py index ab05df5..bd5cf7b 100644 --- a/src/remove_ai_watermarks/metadata.py +++ b/src/remove_ai_watermarks/metadata.py @@ -9,6 +9,7 @@ 
For metadata-only operations, the heavy ML dependencies are NOT required. from __future__ import annotations import contextlib +import functools import logging import re import struct @@ -209,7 +210,10 @@ def _png_late_metadata(image_path: Path, window: int) -> bytes: if chunk_type in _PNG_META_CHUNKS and data_start >= window: f.seek(data_start) out += f.read(safe_length) - pos = data_start + length + 4 # data + CRC + # Advance by the CLAMPED length: a malformed/inflated `length` that + # overshoots EOF must not push `pos` past the file and abort the scan + # (which would silently skip a genuine AI-label chunk after it). + pos = data_start + safe_length + 4 # data + CRC except OSError as exc: logger.debug("PNG late-metadata scan failed on %s: %s", image_path, exc) return b"" @@ -227,7 +231,29 @@ def scan_head(image_path: Path, size: int = 1024 * 1024) -> bytes: non-faststart MP4 manifest, or a PNG XMP packet appended after the pixels -- which a fixed first-MB read would miss. For other inputs, and for files that fit within ``size``, it is exactly ``f.read(size)`` -- behavior-neutral. + + The result is memoized per (path, size, mtime): one ``identify``/``get_ai_metadata`` + call fans out to ~8 byte-scan detectors that each call this on the same file, so + the cache turns those repeated reads into one. The mtime key invalidates the entry + when the file changes; the small ``maxsize`` bounds memory to a few MB. """ + try: + mtime = image_path.stat().st_mtime_ns + except OSError: + # No stat (e.g. a pipe, or a race): read uncached rather than fail. 
+ return _scan_head_impl(image_path, size) + return _scan_head_cached(str(image_path), size, mtime) + + +@functools.lru_cache(maxsize=8) +def _scan_head_cached(path_str: str, size: int, _mtime_ns: int) -> bytes: + """Cache shim: ``_mtime_ns`` is part of the key only (invalidates on change).""" + from pathlib import Path as _Path + + return _scan_head_impl(_Path(path_str), size) + + +def _scan_head_impl(image_path: Path, size: int) -> bytes: with open(image_path, "rb") as f: head = f.read(size) # Lazy import: isobmff imports this module's constants at top level. diff --git a/src/remove_ai_watermarks/noai/img2img_runner.py b/src/remove_ai_watermarks/noai/img2img_runner.py index f295268..a5aed90 100644 --- a/src/remove_ai_watermarks/noai/img2img_runner.py +++ b/src/remove_ai_watermarks/noai/img2img_runner.py @@ -52,7 +52,16 @@ def run_img2img( ) done_ev.set() return result.images[0] - except TypeError: + except TypeError as exc: + # The only TypeError we retry is the deprecated-callback case: `_call_pipeline` + # passes the legacy `callback`/`callback_steps` kwargs, and a diffusers version + # that removed them raises TypeError("... unexpected keyword argument + # 'callback'"). We then re-run once WITHOUT the progress callback. Any OTHER + # TypeError (e.g. a bad control_image/dtype in the forward pass) is a real error + # -- re-raise it instead of silently re-running the whole diffusion pass and + # masking the cause. 
+ if "callback" not in str(exc): + raise first_step.set() result = _call_pipeline( pipeline, image, strength, num_inference_steps, guidance_scale, generator, None, extra_kwargs diff --git a/src/remove_ai_watermarks/noai/watermark_profiles.py b/src/remove_ai_watermarks/noai/watermark_profiles.py index 153ee20..b63a0d2 100644 --- a/src/remove_ai_watermarks/noai/watermark_profiles.py +++ b/src/remove_ai_watermarks/noai/watermark_profiles.py @@ -16,7 +16,6 @@ DEFAULT_MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0" # profile is ``sdxl``; ``default`` is kept as an accepted alias (it was the profile's # name before ``controlnet`` became the default-selected pipeline, 2026-06-09). SDXL_PROFILE = "sdxl" -CONTROLNET_PROFILE = "controlnet" _PROFILE_ALIASES = {"default": SDXL_PROFILE} @@ -119,16 +118,3 @@ def vendor_for_strength(image_path: Path) -> Literal["openai", "google"] | None: if "openai" in src: return "openai" return None - - -def get_model_id_for_profile(profile: str) -> str: - """Map CLI model profile names to concrete Hugging Face model IDs. - - Both ``sdxl`` and ``controlnet`` use the SDXL base checkpoint -- the canny - ControlNet (``CONTROLNET_CANNY_MODEL``) is an add-on loaded on top of it, not a - separate base model. The legacy ``default`` alias resolves to ``sdxl``. - """ - normalized = normalize_profile(profile) - if normalized in (SDXL_PROFILE, CONTROLNET_PROFILE): - return DEFAULT_MODEL_ID - raise ValueError(f"Unknown model profile '{profile}'. Use one of: sdxl, controlnet.") diff --git a/src/remove_ai_watermarks/samsung_engine.py b/src/remove_ai_watermarks/samsung_engine.py index 4f11527..553d8a3 100644 --- a/src/remove_ai_watermarks/samsung_engine.py +++ b/src/remove_ai_watermarks/samsung_engine.py @@ -1,394 +1,116 @@ """Samsung Galaxy AI visible watermark removal engine. 
-Samsung's on-device Generative AI photo edits (Generative Edit / Sketch to Image / -Portrait Studio on Galaxy phones) stamp a visible localized wordmark -- a sparkle -icon followed by a "generated with AI" string -- in the **bottom-left** corner: a -light, low-opacity semi-transparent white overlay. The string is locale-specific; -this engine is calibrated for the Italian "Contenuti generati dall'AI" variant -(issue #37, captures from @f-liva). Other locales need their own captured alpha -template, but the geometry and removal recipe are shared. +Samsung's on-device Generative AI photo edits burn a visible "✦ Contenuti generati +dall'AI" wordmark into the bottom-LEFT corner (the Italian locale variant calibrated +here; the string is locale-specific). It is a faint, near-white semi-transparent +overlay, the same overlay class as the Doubao/Jimeng marks but bottom-left. -Like the Gemini sparkle and the Doubao / Jimeng marks it is a fixed overlay, so -removal starts from **reverse-alpha blending** against a captured alpha map -(``remove_watermark_reverse_alpha``): ``original = (wm - a*logo)/(1-a)``. The logo -is pure white (255,255,255); the alpha map was solved from the GRAY Samsung capture -(see ``data/samsung_capture/``), bundled as ``assets/samsung_alpha.png`` -- the same -careful build as Jimeng/Doubao (cubic-background fit, mean over channels, full halo -extent, unblurred). The Samsung mark is faint (peak alpha ~0.38), so the glyph reads -as a soft light-gray strip. - -The mark is anchored bottom-LEFT (Doubao/Jimeng are bottom-right) and scales with -image WIDTH (~0.32 of width). 
The flat calibration captures arrive at the phone's -flat-edit size (~1086 wide) while real photos are ~3000 wide, so a single alpha map -cannot pixel-cancel the upscaled, per-image re-rasterized mark; removal therefore -NCC-aligns the alpha to the actual mark (always), reverse-alphas, then clears the -residual with a deliberately THIN inpaint over the glyph footprint -- the exact -recipe Jimeng uses. Verified on the flat captures and a real ~2958-wide download. - -Detection (``detect``) matches the bundled glyph silhouette against the corner -candidate via normalized correlation, keying on the actual mark shape rather than -coverage heuristics. Samsung edits also carry C2PA + the Galaxy ``genAIType`` -marker (see ``metadata``/``identify``), so the visible path is the stripped-metadata -fallback / the *removal* path, not a new ``identify`` signal. - -``locate`` (geometry box) and ``extract_mask`` (the candidate glyph mask the -detector correlates) mirror the Doubao/Jimeng engines. Fast, offline, no GPU. -Arbitrary-region inpainting still lives in ``region_eraser`` / the ``erase`` command. +Removal is **reverse-alpha blending** against a captured alpha map +(``original = (wm - a*logo)/(1-a)``), always NCC-aligned to the actual mark plus a thin +residual inpaint over the glyph footprint. This is one of the three text-mark engines +that share :class:`remove_ai_watermarks._text_mark_engine.TextMarkEngine`; this module +supplies only Samsung's tuned :class:`TextMarkConfig` (bottom-LEFT corner, a lower glyph +luma since the mark is faint, ``assets/samsung_alpha.png`` solved from the flat captures +by ``scripts/visible_alpha_solve.py``). Samsung Galaxy AI edits are also caught by C2PA ++ the ``genAIType`` marker, so this is the visible-mark *removal* path; it also feeds +``identify`` as the medium-confidence ``visible_samsung`` signal via the registry. 
""" +# The module-level _alpha_template / _glyph_silhouette / _template_match_score below +# are thin test-facing shims (imported by tests/), so pyright's src-only pass sees them +# as unused; the use is cross-module. +# pyright: reportUnusedFunction=false -# cv2/numpy boundary: third-party libs ship no usable element types; relax the -# unknown-type rules for this file only. -# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false from __future__ import annotations -import logging -from dataclasses import dataclass from typing import TYPE_CHECKING, Any -import cv2 -import numpy as np +from remove_ai_watermarks import _text_mark_engine +from remove_ai_watermarks._text_mark_engine import TextMarkConfig, TextMarkDetection, TextMarkEngine if TYPE_CHECKING: - from pathlib import Path - from numpy.typing import NDArray -logger = logging.getLogger(__name__) - - -# Geometry as a fraction of image WIDTH. The Samsung mark scales with width and is -# anchored bottom-LEFT. The box is intentionally generous (the glyph mask tightens -# it and the alignment search refines position); values cover the 1086 flat captures -# and the ~2958 real photos (both measured at width_frac ~0.31). +# Locate geometry as a fraction of image WIDTH (mark scales with width, bottom-LEFT). 
WM_WIDTH_FRAC = 0.40 WM_HEIGHT_FRAC = 0.060 MARGIN_LEFT_FRAC = 0.004 MARGIN_BOTTOM_FRAC = 0.002 -# Glyph appearance: a low-saturation light gray rendered brighter than the -# surrounding content (white top-hat), same polarity logic as Doubao/Jimeng so a -# white-paper document is left untouched. LOGO_MIN_LUMA is lower than Jimeng's -# because the Samsung mark is fainter (peak alpha ~0.38), so on a mid/dark -# background the glyph luma is lower; the top-hat + NCC shape gate keep precision. -MAX_SATURATION = 55 # max channel spread to count a pixel as "grayish" -LOGO_MIN_LUMA = 110 # glyphs are at least this bright in absolute terms -TOPHAT_DELTA = 8 # glyph must exceed the local background by this many levels +# Glyph appearance: a light, low-saturation gray. LOGO_MIN_LUMA is lower than Jimeng's +# because the mark is faint (peak alpha ~0.38), so on a mid/dark background its glyph +# luma is lower; a white-paper document is still left untouched. +MAX_SATURATION = 55 +LOGO_MIN_LUMA = 110 +TOPHAT_DELTA = 8 -# Detection matches the bundled alpha-template glyph silhouette -# (assets/samsung_alpha.png) against the candidate via zero-mean normalized -# correlation (cv2 TM_CCOEFF_NORMED). A small coverage floor skips the template -# match on a near-empty candidate box. The threshold is validated against the real -# capture set and the other visible marks (Doubao/Jimeng/Gemini must not cross-fire). +# Shape-consistent detection. Threshold 0.40; real marks ~0.79, and Doubao/Jimeng score +# 0.0 here (and Samsung 0.0 on theirs) -- no cross-fire (the corner also differs). DETECT_MIN_COVERAGE = 0.01 DETECT_NCC_THRESHOLD = 0.40 -# ── Reverse-alpha (recovery, Gemini/Doubao/Jimeng-style) ───────────── -# The Samsung mark is a fixed semi-transparent white overlay; given its alpha map -# the original pixels are recovered by inverting the blend. The logo is pure white -# (the white capture confirms it). 
The alpha map was solved from the GRAY capture by -# scripts/visible_alpha_solve.py (cubic-background fit, mean over channels, full halo, -# unblurred); the bundled asset (assets/samsung_alpha.png) is that template (a*255) -# at the captured width. The mark scales with image WIDTH, and the flat captures are -# ~2.7x smaller than real photos, so a pure width-scale is only approximate; removal -# also registers the template to the actual mark via a TM_CCOEFF_NORMED scale+position -# search (`_aligned_alpha_map`). +# Reverse-alpha geometry, solved by scripts/visible_alpha_solve.py from the flat gray +# capture (native width 1086). Real photos are ~2958 wide, so the captured glyph is +# upscaled; width-scale + NCC-align removes it cleanly (a flat capture at the real +# resolution would make the alpha pixel-sharp -- an open quality upgrade). _ALPHA_NATIVE_WIDTH = 1086 _ALPHA_LOGO_BGR: tuple[float, float, float] = (255.0, 255.0, 255.0) -# Geometry below is emitted by scripts/visible_alpha_solve.py for the bundled -# asset -- keep them in sync when the asset is rebuilt. _ALPHA_WIDTH_FRAC = 0.3195 # asset width / image width -- the alignment scale seed _ALPHA_HEIGHT_FRAC = 0.0378 -# Margins (of image WIDTH) of the captured mark -- the geometry record / where to -# seed; alignment refines the actual position, so these are not load-bearing. _ALPHA_MARGIN_LEFT_FRAC = 0.0110 _ALPHA_MARGIN_BOTTOM_FRAC = 0.0064 -# Alignment scale search (np.linspace args) around the width-scaled glyph size -- -# wider than Jimeng's because the flat captures are far off the real-photo width, so -# the per-image scale can drift more from the width-scaled seed. +# Wider scale search: the flat capture is far off the real-photo width. 
_ALPHA_ALIGN_SEARCH = (0.85, 1.18, 23) -# Residual inpaint footprint: a single capture upscaled to the real-photo width -# cannot pixel-cancel the re-rasterized mark, so the glyph footprint (alpha above -# this) is always inpainted after reverse-alpha (dilated by this kernel, INPAINT_NS). -# Kept deliberately THIN -- reverse-alpha already recovers the true background under -# the semi-transparent mark, so the inpaint only finishes the residual edges. _RESIDUAL_ALPHA_FLOOR = 0.05 _RESIDUAL_DILATE = 5 _RESIDUAL_INPAINT_RADIUS = 2 -_alpha_template_cache: NDArray[Any] | None = None + +_CONFIG = TextMarkConfig( + name="Samsung Galaxy AI", + asset_name="samsung_alpha.png", + corner="bl", + margin_floor=2, + width_frac=WM_WIDTH_FRAC, + height_frac=WM_HEIGHT_FRAC, + margin_x_frac=MARGIN_LEFT_FRAC, + margin_bottom_frac=MARGIN_BOTTOM_FRAC, + max_saturation=MAX_SATURATION, + logo_min_luma=LOGO_MIN_LUMA, + tophat_delta=TOPHAT_DELTA, + morph_open_size=3, + detect_min_coverage=DETECT_MIN_COVERAGE, + detect_ncc_threshold=DETECT_NCC_THRESHOLD, + alpha_width_frac=_ALPHA_WIDTH_FRAC, + alpha_height_frac=_ALPHA_HEIGHT_FRAC, + alpha_margin_x_frac=_ALPHA_MARGIN_LEFT_FRAC, + alpha_margin_bottom_frac=_ALPHA_MARGIN_BOTTOM_FRAC, + alpha_align_search=_ALPHA_ALIGN_SEARCH, + min_gw=16, + alpha_logo_bgr=_ALPHA_LOGO_BGR, + residual_alpha_floor=_RESIDUAL_ALPHA_FLOOR, + residual_dilate=_RESIDUAL_DILATE, + residual_inpaint_radius=_RESIDUAL_INPAINT_RADIUS, +) + +SamsungDetection = TextMarkDetection def _alpha_template() -> NDArray[Any] | None: - """Lazily load the bundled Samsung alpha template (float [0,1]), or None.""" - global _alpha_template_cache - if _alpha_template_cache is None: - from pathlib import Path - - from remove_ai_watermarks import image_io - - path = Path(__file__).parent / "assets" / "samsung_alpha.png" - img = image_io.imread(str(path), cv2.IMREAD_GRAYSCALE) - if img is None: - return None - _alpha_template_cache = img.astype(np.float32) / 255.0 - return _alpha_template_cache - 
- -@dataclass(frozen=True) -class SamsungLocation: - """Located watermark box (bottom-left), in absolute pixel coordinates.""" - - x: int - y: int - w: int - h: int - is_fallback: bool = True # geometry anchor (no template match) -> always True for now - - @property - def bbox(self) -> tuple[int, int, int, int]: - return self.x, self.y, self.w, self.h - - -@dataclass -class SamsungDetection: - """Result of visible Samsung Galaxy AI watermark detection.""" - - detected: bool = False - confidence: float = 0.0 - region: tuple[int, int, int, int] = (0, 0, 0, 0) - coverage: float = 0.0 # fraction of the box occupied by glyph pixels - - -_silhouette_cache: NDArray[Any] | None = None + """The bundled Samsung alpha template (float [0,1]), or None.""" + return _text_mark_engine.load_alpha_template(_CONFIG.asset_name) def _glyph_silhouette() -> NDArray[Any] | None: - """Binary glyph silhouette (255 = glyph) from the bundled alpha map, used as the - detection template. None if the alpha asset is missing. 
The threshold is a - fraction of the (faint) peak alpha so the thin strokes survive.""" - global _silhouette_cache - if _silhouette_cache is None: - at = _alpha_template() - if at is None: - return None - _silhouette_cache = (at > 0.10).astype(np.uint8) * 255 - return _silhouette_cache + """Binary "Contenuti generati dall'AI" silhouette (255 = glyph), or None.""" + return _text_mark_engine.glyph_silhouette(_CONFIG.asset_name) def _template_match_score(box_mask: NDArray[Any], image_width: int) -> float: - """Zero-mean normalized correlation of the alpha-template glyph silhouette - (scaled to the mark's expected size) against the candidate ``box_mask``.""" - sil = _glyph_silhouette() - if sil is None or box_mask.size == 0: - return 0.0 - gw = min(box_mask.shape[1] - 1, max(16, int(_ALPHA_WIDTH_FRAC * image_width))) - gh = min(box_mask.shape[0] - 1, max(4, int(_ALPHA_HEIGHT_FRAC * image_width))) - if gw < 16 or gh < 4: - return 0.0 - template = cv2.resize(sil, (gw, gh), interpolation=cv2.INTER_NEAREST) - return float(cv2.matchTemplate(box_mask, template, cv2.TM_CCOEFF_NORMED).max()) + """TM_CCOEFF_NORMED of the Samsung glyph silhouette against ``box_mask``.""" + return _text_mark_engine.template_match_score(box_mask, image_width, _CONFIG) -class SamsungEngine: - """Remove the visible Samsung Galaxy AI watermark (locate -> mask -> reverse-alpha).""" +class SamsungEngine(TextMarkEngine): + """Remove the visible Samsung Galaxy AI text mark (locate -> mask -> reverse-alpha).""" - def __init__( - self, - *, - width_frac: float = WM_WIDTH_FRAC, - height_frac: float = WM_HEIGHT_FRAC, - margin_left_frac: float = MARGIN_LEFT_FRAC, - margin_bottom_frac: float = MARGIN_BOTTOM_FRAC, - ) -> None: - self.width_frac = width_frac - self.height_frac = height_frac - self.margin_left_frac = margin_left_frac - self.margin_bottom_frac = margin_bottom_frac - - # ── Locate ──────────────────────────────────────────────────────── - - def locate(self, image: NDArray[Any]) -> SamsungLocation: 
- """Anchor the watermark box in the bottom-left corner by geometry.""" - h, w = image.shape[:2] - wm_w = max(40, int(w * self.width_frac)) - wm_h = max(16, int(w * self.height_frac)) - margin_l = max(2, int(w * self.margin_left_frac)) - margin_b = max(2, int(w * self.margin_bottom_frac)) - x = min(margin_l, max(0, w - wm_w)) - y = max(0, h - margin_b - wm_h) - wm_w = min(wm_w, w - x) - wm_h = min(wm_h, h - y) - return SamsungLocation(x=x, y=y, w=wm_w, h=wm_h, is_fallback=True) - - # ── Mask ────────────────────────────────────────────────────────── - - def extract_mask(self, image: NDArray[Any], loc: SamsungLocation) -> NDArray[Any]: - """Build a full-image uint8 mask (255 = watermark glyph) for the box. - - Polarity-aware: the mark is a light, low-saturation gray rendered brighter - than the local background (white top-hat), so a white-paper document is left - untouched (nothing brighter than its surroundings is masked there). - """ - h, w = image.shape[:2] - x, y, bw, bh = loc.bbox - # A degenerate ROI (a sliver from an extremely wide/short image) cannot hold - # the mark and would feed cv2's GaussianBlur/morphology a ~1-px-tall array, - # which can fault the native code on some platforms (mirrors the Doubao/Jimeng - # guard). Skip the cv2 pipeline and return an empty mask there. - if bh < 16 or bw < 16: - return np.zeros((h, w), np.uint8) - # Normalize the ROI to 3-channel BGR: a 2D grayscale or 4-channel BGRA input - # would otherwise break the axis=2 channel reductions below. 
- roi = image[y : y + bh, x : x + bw] - if roi.ndim == 2: - roi = cv2.cvtColor(roi, cv2.COLOR_GRAY2BGR) - elif roi.shape[2] == 4: - roi = cv2.cvtColor(roi, cv2.COLOR_BGRA2BGR) - roi = roi.astype(np.float32) - - luma = roi.mean(axis=2) - sat = roi.max(axis=2) - roi.min(axis=2) - grayish = sat < MAX_SATURATION - - sigma = max(4.0, bh * 0.4) - local_bg = cv2.GaussianBlur(luma, (0, 0), sigmaX=sigma, sigmaY=sigma) - tophat = luma - local_bg - - cand = grayish & (tophat > TOPHAT_DELTA) & (luma > LOGO_MIN_LUMA) - glyph = cand.astype(np.uint8) * 255 - glyph = cv2.morphologyEx(glyph, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8)) - glyph = cv2.morphologyEx(glyph, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8)) - - mask = np.zeros((h, w), np.uint8) - mask[y : y + bh, x : x + bw] = glyph - return mask - - # ── Detect ──────────────────────────────────────────────────────── - - def detect(self, image: NDArray[Any]) -> SamsungDetection: - """Detect the visible Samsung mark by matching the alpha-template glyph - silhouette against the corner candidate (TM_CCOEFF_NORMED).""" - det = SamsungDetection() - if image is None or image.size == 0: - return det - loc = self.locate(image) - mask = self.extract_mask(image, loc) - x, y, bw, bh = loc.bbox - box = mask[y : y + bh, x : x + bw] - coverage = float((box > 0).sum()) / float(max(1, bw * bh)) - det.region = loc.bbox - det.coverage = coverage - if coverage >= DETECT_MIN_COVERAGE: - score = _template_match_score(box, image.shape[1]) - det.confidence = score - det.detected = score >= DETECT_NCC_THRESHOLD - logger.debug("Samsung detect: coverage=%.3f ncc=%.2f detected=%s", coverage, score, det.detected) - return det - - # ── Reverse-alpha (recovery + residual inpaint) ─────────────────── - - def reverse_alpha_available(self, image: NDArray[Any]) -> bool: - """True if the bundled alpha map is loadable (NCC alignment places it at any - resolution; the caller still gates on ``detect``).""" - return image is not None and image.size > 0 and 
_alpha_template() is not None - - def _fixed_alpha_map(self, image: NDArray[Any]) -> tuple[NDArray[Any], tuple[int, int, int, int]] | None: - """Place the template by fixed width-relative geometry (bottom-left).""" - at = _alpha_template() - if at is None: - return None - h, w = image.shape[:2] - gw = min(w, max(1, int(_ALPHA_WIDTH_FRAC * w))) - gh = min(h, max(1, int(_ALPHA_HEIGHT_FRAC * w))) - ax = min(max(0, int(_ALPHA_MARGIN_LEFT_FRAC * w)), max(0, w - gw)) - ay = max(0, h - int(_ALPHA_MARGIN_BOTTOM_FRAC * w) - gh) - amap = np.zeros((h, w), np.float32) - amap[ay : ay + gh, ax : ax + gw] = cv2.resize(at, (gw, gh), interpolation=cv2.INTER_LINEAR) - return amap, (ax, ay, gw, gh) - - def _aligned_alpha_map(self, image: NDArray[Any]) -> tuple[NDArray[Any], tuple[int, int, int, int]] | None: - """Register the captured template to the actual mark via a TM_CCOEFF_NORMED - scale + position search -- so the single capture works off the captured - width. Returns ``(alpha_map, glyph_bbox)`` or None.""" - at = _alpha_template() - sil = _glyph_silhouette() - if at is None or sil is None: - return None - h, w = image.shape[:2] - loc = self.locate(image) - bx, by, bw, bh = loc.bbox - box_mask = self.extract_mask(image, loc)[by : by + bh, bx : bx + bw] - expected = _ALPHA_WIDTH_FRAC * w - best: tuple[float, int, int, int, int] | None = None - for scale in np.linspace(*_ALPHA_ALIGN_SEARCH): - gw, gh = int(expected * scale), int(_ALPHA_HEIGHT_FRAC * w * scale) - if gw < 16 or gh < 4 or gw >= bw or gh >= bh: - continue - t = cv2.resize(sil, (gw, gh), interpolation=cv2.INTER_NEAREST) - _, score, _, top_left = cv2.minMaxLoc(cv2.matchTemplate(box_mask, t, cv2.TM_CCOEFF_NORMED)) - if best is None or score > best[0]: - best = (score, gw, gh, top_left[0], top_left[1]) - if best is None: - return None - _, gw, gh, ox, oy = best - ax, ay = bx + ox, by + oy - amap = np.zeros((h, w), np.float32) - amap[ay : ay + gh, ax : ax + gw] = cv2.resize(at, (gw, gh), interpolation=cv2.INTER_LINEAR) - 
return amap, (ax, ay, gw, gh) - - def _apply_reverse_alpha(self, image: NDArray[Any], amap: NDArray[Any]) -> NDArray[Any]: - """Invert the alpha blend with ``amap``: ``original = (wm - a*logo)/(1-a)``.""" - a3 = np.clip(amap, 0.0, 1.0)[:, :, None] - logo = np.array(_ALPHA_LOGO_BGR, np.float32) - return np.clip((image.astype(np.float32) - a3 * logo) / np.clip(1.0 - a3, 0.25, 1.0), 0, 255).astype(np.uint8) - - def remove_watermark_reverse_alpha(self, image: NDArray[Any], *, residual_inpaint: bool = True) -> NDArray[Any]: - """Recover the original pixels by inverting the alpha blend, then clear the - residual outline with a thin inpaint over the glyph footprint. - - Placement: fixed geometry AND the NCC-aligned placement are always tried and - the one leaving the least residual mark (lowest re-``detect`` confidence) is - kept -- the flat capture is far off the real-photo width and the mark - re-rasterizes per image, so fixed geometry alone is not reliable. A single - capture cannot pixel-cancel the upscaled mark, so a deliberately THIN residual - inpaint (``_RESIDUAL_*``) follows. Call only when - :meth:`reverse_alpha_available` and the mark is detected. - """ - # Normalize to 3-channel BGR so a 2D grayscale or 4-channel BGRA input does - # not break the reverse-alpha math (which assumes a 3-channel logo). - if image.ndim == 2: - image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) - elif image.shape[2] == 4: - image = cv2.cvtColor(image, cv2.COLOR_BGRA2BGR) - # An image too small to hold the mark would make the geometry boxes degenerate - # and feed cv2.resize a ~1-px-tall target; skip cv2 entirely (mirrors Jimeng). 
- h, w = image.shape[:2] - if h < 32 or w < 64: - return image.copy() - maps = [c for c in (self._fixed_alpha_map(image), self._aligned_alpha_map(image)) if c is not None] - if not maps: - return image.copy() - best_out: NDArray[Any] | None = None - best_amap: NDArray[Any] | None = None - best_residual = float("inf") - for amap, _region in maps: - out = self._apply_reverse_alpha(image, amap) - residual = self.detect(out).confidence - if residual < best_residual: - best_residual, best_out, best_amap = residual, out, amap - if best_out is None or best_amap is None: # pragma: no cover - maps is non-empty - return image.copy() - if residual_inpaint: - kernel = np.ones((_RESIDUAL_DILATE, _RESIDUAL_DILATE), np.uint8) - rm = cv2.dilate((best_amap > _RESIDUAL_ALPHA_FLOOR).astype(np.uint8) * 255, kernel) - best_out = cv2.inpaint(best_out, rm, _RESIDUAL_INPAINT_RADIUS, cv2.INPAINT_NS) - return best_out - - -def load_image_bgr(path: str | Path) -> NDArray[Any]: - """Read an image as BGR ndarray (helper for scripts/tests).""" - from remove_ai_watermarks import image_io - - img = image_io.imread(path, cv2.IMREAD_COLOR) - if img is None: - raise FileNotFoundError(f"Failed to read image: {path}") - return img + def __init__(self) -> None: + super().__init__(_CONFIG) diff --git a/src/remove_ai_watermarks/watermark_registry.py b/src/remove_ai_watermarks/watermark_registry.py index 6ba2089..bcc3935 100644 --- a/src/remove_ai_watermarks/watermark_registry.py +++ b/src/remove_ai_watermarks/watermark_registry.py @@ -159,71 +159,45 @@ def _gemini_remove( return result, det.region -def _doubao_detect(image: NDArray[Any]) -> MarkDetection: - d = _engine("doubao").detect(image) - return MarkDetection("doubao", "Doubao 豆包AI生成 text", "bottom-right", d.detected, d.confidence, d.region) +# The three text-mark engines (Doubao/Jimeng/Samsung) share the TextMarkEngine +# interface, so one parameterized adapter pair drives all of them -- a new +# reverse-alpha text mark is one `_text_mark(...)` 
row below, not another copy-paste +# of these bodies. Removal is reverse-alpha only: applied when the mark is detected +# (or forced) and the alpha asset loads, otherwise skipped (no hallucination on a +# clean corner). +def _text_mark_detect(key: str, label: str, location: str) -> Callable[[NDArray[Any]], MarkDetection]: + def detect(image: NDArray[Any]) -> MarkDetection: + d = _engine(key).detect(image) + return MarkDetection(key, label, location, d.detected, d.confidence, d.region) + + return detect -def _doubao_remove( - image: NDArray[Any], _inpaint_method: InpaintMethod, _inpaint: bool, _strength: float, force: bool -) -> tuple[NDArray[Any], Region | None]: - # Reverse-alpha only: apply when the mark is present AND the resolution is in - # the alpha map's calibrated band. Outside it we do NOT inpaint (no - # hallucination) -- removal is skipped until a capture for that resolution. - engine = _engine("doubao") - det = engine.detect(image) - if (det.detected or force) and engine.reverse_alpha_available(image): - return engine.remove_watermark_reverse_alpha(image), (det.region if det.detected else None) - return image.copy(), None +def _text_mark_remove(key: str) -> Callable[..., tuple[NDArray[Any], Region | None]]: + def remove( + image: NDArray[Any], _inpaint_method: InpaintMethod, _inpaint: bool, _strength: float, force: bool + ) -> tuple[NDArray[Any], Region | None]: + engine = _engine(key) + det = engine.detect(image) + if (det.detected or force) and engine.reverse_alpha_available(image): + return engine.remove_watermark_reverse_alpha(image), (det.region if det.detected else None) + return image.copy(), None + + return remove -def _jimeng_detect(image: NDArray[Any]) -> MarkDetection: - d = _engine("jimeng").detect(image) - return MarkDetection("jimeng", "Jimeng 即梦AI wordmark", "bottom-right", d.detected, d.confidence, d.region) - - -def _jimeng_remove( - image: NDArray[Any], _inpaint_method: InpaintMethod, _inpaint: bool, _strength: float, force: bool -) -> 
tuple[NDArray[Any], Region | None]: - # Reverse-alpha (with an always-on residual inpaint over the glyph footprint, - # see the engine): apply when the mark is present and the alpha asset loads. - # Skipped otherwise (no hallucination on a clean corner). - engine = _engine("jimeng") - det = engine.detect(image) - if (det.detected or force) and engine.reverse_alpha_available(image): - return engine.remove_watermark_reverse_alpha(image), (det.region if det.detected else None) - return image.copy(), None - - -def _samsung_detect(image: NDArray[Any]) -> MarkDetection: - d = _engine("samsung").detect(image) - return MarkDetection("samsung", "Samsung Galaxy AI text", "bottom-left", d.detected, d.confidence, d.region) - - -def _samsung_remove( - image: NDArray[Any], _inpaint_method: InpaintMethod, _inpaint: bool, _strength: float, force: bool -) -> tuple[NDArray[Any], Region | None]: - # Reverse-alpha (with an always-on thin residual inpaint over the glyph - # footprint, see the engine): apply when the mark is present and the alpha asset - # loads. Skipped otherwise (no hallucination on a clean corner). - engine = _engine("samsung") - det = engine.detect(image) - if (det.detected or force) and engine.reverse_alpha_available(image): - return engine.remove_watermark_reverse_alpha(image), (det.region if det.detected else None) - return image.copy(), None +def _text_mark(key: str, label: str, location: str) -> KnownMark: + """A reverse-alpha text-mark registry row (Doubao/Jimeng/Samsung).""" + return KnownMark( + key, label, location, True, "reverse-alpha", _text_mark_detect(key, label, location), _text_mark_remove(key) + ) _REGISTRY: tuple[KnownMark, ...] 
= ( KnownMark("gemini", "Google Gemini sparkle", "bottom-right", True, "reverse-alpha", _gemini_detect, _gemini_remove), - KnownMark( - "doubao", "Doubao 豆包AI生成 text", "bottom-right", True, "reverse-alpha", _doubao_detect, _doubao_remove - ), - KnownMark( - "jimeng", "Jimeng 即梦AI wordmark", "bottom-right", True, "reverse-alpha", _jimeng_detect, _jimeng_remove - ), - KnownMark( - "samsung", "Samsung Galaxy AI text", "bottom-left", True, "reverse-alpha", _samsung_detect, _samsung_remove - ), + _text_mark("doubao", "Doubao 豆包AI生成 text", "bottom-right"), + _text_mark("jimeng", "Jimeng 即梦AI wordmark", "bottom-right"), + _text_mark("samsung", "Samsung Galaxy AI text", "bottom-left"), ) diff --git a/tests/test_cli.py b/tests/test_cli.py index defc988..34944fa 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -374,6 +374,21 @@ class TestAllCommand: result = runner.invoke(main, ["all", "/nonexistent/file.png"]) assert result.exit_code != 0 + def test_all_visible_step_uses_registry(self, runner, sample_png, tmp_path): + """Regression (#1): the `all` visible step must route through the registry + (best_auto_mark), so Doubao/Jimeng/Samsung text marks are handled -- not just + the Gemini sparkle via a hardcoded GeminiEngine.""" + mock_cls, _mock_engine = _mock_invisible_engine() + output = tmp_path / "clean.png" + with ( + patch("remove_ai_watermarks.cli.InvisibleEngine", mock_cls, create=True), + patch("remove_ai_watermarks.invisible_engine.InvisibleEngine", mock_cls), + patch("remove_ai_watermarks.watermark_registry.best_auto_mark", return_value=None) as mock_best, + ): + result = runner.invoke(main, ["all", str(sample_png), "-o", str(output)]) + assert result.exit_code == 0, result.output + mock_best.assert_called() # the registry auto-detector drove the visible pass + def test_all_preserves_rgba_across_invisible_step(self, runner, tmp_path): """Regression: ``all`` must keep transparency even when the invisible step writes a 3-channel result (as the real diffusion 
engine does). diff --git a/tests/test_gemini_engine.py b/tests/test_gemini_engine.py index 1f5f7e3..b5359dc 100644 --- a/tests/test_gemini_engine.py +++ b/tests/test_gemini_engine.py @@ -124,6 +124,19 @@ class TestGeminiEngine: result = self.engine.remove_watermark_custom(image, (10, 10, 48, 48)) assert result.shape == image.shape + def test_detect_on_grayscale_does_not_crash(self): + # A 2D grayscale array reaching detect_watermark (registry adapter / library + # API) must not crash the FP-gate's axis=2 reduction; it is normalized to BGR. + gray = np.full((300, 300), 100, dtype=np.uint8) + result = self.engine.detect_watermark(gray) + assert result is not None + + def test_remove_on_bgra_returns_3_channel(self): + bgra = np.zeros((300, 300, 4), dtype=np.uint8) + bgra[..., 3] = 255 + result = self.engine.remove_watermark(bgra) + assert result.shape == (300, 300, 3) + def test_remove_watermark_custom_large_region(self, tmp_image_path): image = cv2.imread(str(tmp_image_path), cv2.IMREAD_COLOR) result = self.engine.remove_watermark_custom(image, (10, 10, 96, 96)) diff --git a/tests/test_image_io.py b/tests/test_image_io.py index c23c8bd..30c3091 100644 --- a/tests/test_image_io.py +++ b/tests/test_image_io.py @@ -59,6 +59,32 @@ class TestUnicodeRoundTrip: assert np.array_equal(out, src) +class TestToBgr: + def test_grayscale_2d_promoted_to_bgr(self) -> None: + gray = np.full((4, 5), 120, dtype=np.uint8) + out = image_io.to_bgr(gray) + assert out.shape == (4, 5, 3) + # GRAY2BGR replicates the channel, so all three match the source. 
+ assert np.array_equal(out[..., 0], gray) + assert np.array_equal(out[..., 0], out[..., 2]) + + def test_single_channel_3d_promoted(self) -> None: + gray = np.full((4, 5, 1), 7, dtype=np.uint8) + assert image_io.to_bgr(gray).shape == (4, 5, 3) + + def test_bgra_dropped_to_bgr(self) -> None: + bgra = np.zeros((4, 5, 4), dtype=np.uint8) + bgra[..., :3] = (10, 120, 240) + out = image_io.to_bgr(bgra) + assert out.shape == (4, 5, 3) + assert np.array_equal(out, bgra[..., :3]) + + def test_bgr_returned_unchanged(self) -> None: + bgr = _make_bgr() + out = image_io.to_bgr(bgr) + assert out is bgr # 3-channel: no copy + + class TestFailureSemantics: def test_missing_file_returns_none(self, tmp_path: Path) -> None: assert image_io.imread(tmp_path / "does-not-exist-不存在.png") is None diff --git a/tests/test_platform.py b/tests/test_platform.py index ae69e91..2e5d3b3 100644 --- a/tests/test_platform.py +++ b/tests/test_platform.py @@ -20,7 +20,6 @@ from remove_ai_watermarks.noai.watermark_profiles import ( GEMINI_STRENGTH, OPENAI_STRENGTH, UNKNOWN_STRENGTH, - get_model_id_for_profile, normalize_profile, resolve_strength, strength_default_help, @@ -111,24 +110,19 @@ class TestMpsErrorDetection: class TestModelProfiles: - """Tests for watermark_profiles.py.""" + """Tests for watermark_profiles.py profile-name normalization.""" - def test_sdxl_profile(self): - assert get_model_id_for_profile("sdxl") == "stabilityai/stable-diffusion-xl-base-1.0" + def test_canonical_profiles_unchanged(self): + assert normalize_profile("sdxl") == "sdxl" + assert normalize_profile("controlnet") == "controlnet" def test_default_alias_resolves_to_sdxl(self): # "default" is the legacy alias for "sdxl" (back-compat for existing scripts). 
- assert get_model_id_for_profile("default") == "stabilityai/stable-diffusion-xl-base-1.0" assert normalize_profile("default") == "sdxl" - assert normalize_profile("controlnet") == "controlnet" - def test_controlnet_profile(self): - # controlnet shares the SDXL base checkpoint (the ControlNet is an add-on). - assert get_model_id_for_profile("controlnet") == "stabilityai/stable-diffusion-xl-base-1.0" - - def test_unknown_profile_raises(self): - with pytest.raises(ValueError, match="Unknown model profile"): - get_model_id_for_profile("nonexistent") + def test_normalize_is_case_and_whitespace_insensitive(self): + assert normalize_profile(" Default ") == "sdxl" + assert normalize_profile("CONTROLNET") == "controlnet" class TestResolveStrength: